ktransformers/kt-kernel/python/utils/loader.py
Benjamin F f05b4009f3 [fix](kt-kernel): fix double mem used by safetensor loader (#1997)
Release the SafeTensor mmap loader singleton after each layer's
load_weights() completes. The C++ engine already holds a deep copy
(cpu_infer.sync() guarantees this), so releasing the mmap handles is
safe. The next layer recreates the loader on demand.

This halves peak memory usage during model loading (e.g. DSv3.2:
1.2T -> 613G).

Based on #1966 by @poryfly — adapted to v0.6.2.post3 codebase
(adds MXFP4 support missing from the original PR).

Co-authored-by: xiongchenhui <xiongchenhui@hisense.com>
2026-05-11 12:00:30 +08:00


"""
Weight loaders for different formats.
This module provides loaders for:
- SafeTensor format (AMX-quantized, FP8, BF16, GPTQ-Int4, MXFP4, and compressed RAWINT4 expert weights)
- GGUF format (for Llamafile quantized weights)
"""
from __future__ import annotations
import os
import numpy as np
import torch
from enum import IntEnum
from safetensors import safe_open
from gguf.gguf_reader import GGUFReader
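# Typical flow (illustrative sketch; the real call sites live in the kt-kernel layer
# loading code, not in this module):
#   loader = SafeTensorLoader("/path/to/amx-quantized-model")   # path is a placeholder
#   experts = loader.load_experts("blk.3")                      # per-NUMA, per-expert arrays
#   ...hand the arrays to the C++ engine, then...
#   loader.close_all_handles()                                  # release the mmap before the next layer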
class GGMLQuantizationType(IntEnum):
"""GGML quantization type enumeration"""
F32 = 0
F16 = 1
Q4_0 = 2
Q4_1 = 3
Q5_0 = 6
Q5_1 = 7
Q8_0 = 8
Q8_1 = 9
Q2_K = 10
Q3_K = 11
Q4_K = 12
Q5_K = 13
Q6_K = 14
Q8_K = 15
IQ2_XXS = 16
IQ2_XS = 17
IQ3_XXS = 18
IQ1_S = 19
IQ4_NL = 20
IQ3_S = 21
IQ2_S = 22
IQ4_XS = 23
I8 = 24
I16 = 25
I32 = 26
I64 = 27
F64 = 28
IQ1_M = 29
BF16 = 30
def translate_name_to_gguf(name):
"""
Translate PyTorch tensor name to GGUF format
"""
name = name.replace("lm_head.", "output.")
name = name.replace("model.embed_tokens.", "token_embd.")
name = name.replace("model.norm.", "output_norm.")
name = name.replace("model.layers.", "blk.")
name = name.replace(".input_layernorm", ".attn_norm")
name = name.replace(".mlp.down_proj", ".ffn_down")
name = name.replace(".mlp.gate_proj", ".ffn_gate")
name = name.replace(".mlp.up_proj", ".ffn_up")
name = name.replace(".post_attention_layernorm", ".ffn_norm")
name = name.replace(".self_attn.q_proj", ".attn_q")
name = name.replace(".self_attn.k_proj", ".attn_k")
name = name.replace(".self_attn.v_proj", ".attn_v")
name = name.replace(".self_attn.o_proj", ".attn_output")
name = name.replace(".self_attn.qkv_proj", ".attn_qkv")
name = name.replace(".self_attn.kv_a_proj_with_mqa", ".attn_kv_a_mqa")
name = name.replace(".self_attn.kv_a_layernorm", ".attn_kv_a_norm")
name = name.replace(".self_attn.kv_b_proj", ".attn_kv_b")
name = name.replace(".self_attn.q_a_proj", ".attn_q_a")
name = name.replace(".self_attn.q_a_layernorm", ".attn_q_a_norm")
name = name.replace(".self_attn.q_b_proj", ".attn_q_b")
name = name.replace(".self_attn.q_norm", ".attn_q_norm")
name = name.replace(".self_attn.k_norm", ".attn_k_norm")
name = name.replace(".shared_expert.", ".shared_experts.")
name = name.replace(".shared_expert_", ".shared_experts_")
name = name.replace(".gate_up_proj.", ".up_proj")
name = name.replace(".mlp.shared_experts.down_proj", ".ffn_down_shexp")
name = name.replace(".mlp.gate.e_score_correction_bias", ".exp_probs_b.bias")
name = name.replace(".mlp.gate", ".ffn_gate_inp")
name = name.replace(".mlp.shared_experts.gate_proj", ".ffn_gate_shexp")
name = name.replace(".mlp.shared_experts.up_proj", ".ffn_up_shexp")
name = name.replace(".mlp.shared_experts_gate", ".ffn_gate_inp_shexp")
name = name.replace(".mlp.experts", "")
name = name.replace(".mlp.experts.ffn_down_exps", ".ffn_down_exps")
name = name.replace(".mlp.experts.ffn_gate_exps", ".ffn_gate_exps")
name = name.replace(".mlp.experts.ffn_up_exps", ".ffn_up_exps")
name = name.replace(".block_sparse_moe.gate.", ".ffn_gate_inp.")
name = name.replace(".block_sparse_moe.experts", "")
name = name.replace(".feed_forward.experts", "")
name = name.replace(".feed_forward.router", ".ffn_gate_inp")
name = name.replace(".feed_forward.shared_experts.down_proj", ".ffn_down_shexp")
name = name.replace(".feed_forward.shared_experts.gate_proj", ".ffn_gate_shexp")
name = name.replace(".feed_forward.shared_experts.up_proj", ".ffn_up_shexp")
return name
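# Example translations (illustrative):
#   "model.layers.3.self_attn.q_proj.weight"        -> "blk.3.attn_q.weight"
#   "model.layers.3.mlp.experts.ffn_up_exps.weight" -> "blk.3.ffn_up_exps.weight"
#   "model.embed_tokens.weight"                     -> "token_embd.weight"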
class SafeTensorLoader:
"""
SafeTensor format loader for AMX quantized weights.
Supports loading tensors from .safetensors files with NUMA-sharded expert weights.
"""
tensor_file_map: dict
tensor_type_map: dict
file_handle_map: dict
tensor_device_map: dict
def __init__(self, file_path: str):
self.__load_tensor_file_map(file_path)
def __load_tensor_file_map(self, file_path: str):
if not os.path.exists(file_path):
raise FileNotFoundError(f"Path not found: {file_path}")
if os.path.isfile(file_path):
folder_path = os.path.dirname(file_path)
else:
folder_path = file_path
self.file_handle_map = {}
self.tensor_file_map = {}
self.tensor_type_map = {}
self.tensor_device_map = {}
found_safetensor = False
for root, _, files in os.walk(folder_path):
files = sorted(files)
for file in files:
if file.endswith(".safetensors"):
found_safetensor = True
file_path = os.path.join(root, file)
if file not in self.file_handle_map:
try:
handle = safe_open(file_path, framework="pt")
self.file_handle_map[file] = handle
except Exception as e:
print(f"Error opening Safetensor file {file_path}: {e}")
continue
f = self.file_handle_map.get(file)
if f is None:
continue
try:
for key in f.keys():
self.tensor_file_map[key] = file
except Exception as e:
print(f"Error reading Safetensor file {file_path}: {e}")
if not found_safetensor:
raise FileNotFoundError(f"No Safetensor files found in {folder_path}")
def load_tensor(self, key: str, device: str = "cpu"):
if key not in self.tensor_file_map:
raise KeyError(f"Key {key} not found in Safetensor files")
file = self.tensor_file_map[key]
f = self.file_handle_map.get(file)
if f is None:
raise FileNotFoundError(f"File {file} not found in Safetensor files")
tensor = f.get_tensor(key)
return tensor.to(device)
def close_all_handles(self):
"""Close all file handles and clear the handle map.
Note: safetensors.safe_open doesn't expose a close() method. Releasing
the mmap relies on reference counting: once file_handle_map is cleared
and no tensor holds a reference to the underlying mmap region, the OS
will reclaim the page cache. gc.collect() is called here to trigger
immediate reclamation rather than waiting for the next GC cycle.
"""
import gc
self.file_handle_map.clear()
gc.collect()
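        # Intended call pattern (illustrative; the caller is the layer loading code, not
        # this module): once a layer's load_weights() has finished and cpu_infer.sync()
        # guarantees the C++ engine holds its own copy, drop the loader so the mmap-backed
        # pages can be reclaimed before the next layer is loaded:
        #   loader.close_all_handles()
        #   loader = None   # the next layer recreates the loader on demand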
def load_experts(self, base_key: str, device: str = "cpu"):
"""
Load expert weights from SafeTensor files.
Expected format:
- blk.{layer_index}.ffn_[up, down, gate]_exps.{expert_id}.numa.{numa_id}.weight
- blk.{layer_index}.ffn_[up, down, gate]_exps.{expert_id}.numa.{numa_id}.scale
Args:
base_key: Base key like "blk.{layer_index}"
device: Target device for tensors
Returns:
Dictionary with keys: up, gate, down, up_scale, gate_scale, down_scale
Each value is a list of lists: [numa_id][expert_id] -> numpy array
"""
up_base_key = f"{base_key}.ffn_up_exps"
gate_base_key = f"{base_key}.ffn_gate_exps"
down_base_key = f"{base_key}.ffn_down_exps"
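        # Concrete key examples (illustrative) for base_key="blk.3", expert 5, NUMA node 0:
        #   blk.3.ffn_up_exps.5.numa.0.weight  and  blk.3.ffn_up_exps.5.numa.0.scale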
max_numa_id = -1
max_experts_count = -1
while self.has_tensor(f"{up_base_key}.{max_experts_count+1}.numa.{0}.weight"):
max_experts_count += 1
        if max_experts_count < 0:  # still -1 means expert 0 was never found
raise ValueError(f"No experts found for key {base_key}")
while self.has_tensor(f"{up_base_key}.{0}.numa.{max_numa_id+1}.weight"):
max_numa_id += 1
# Initialize empty lists to store tensors for each projection type
up_weights = [[] for _ in range(max_numa_id + 1)]
gate_weights = [[] for _ in range(max_numa_id + 1)]
down_weights = [[] for _ in range(max_numa_id + 1)]
up_scales = [[] for _ in range(max_numa_id + 1)]
gate_scales = [[] for _ in range(max_numa_id + 1)]
down_scales = [[] for _ in range(max_numa_id + 1)]
# Check if backward weights exist
up_bwd_base_key = f"{base_key}.ffn_up_bwd_exps"
gate_bwd_base_key = f"{base_key}.ffn_gate_bwd_exps"
down_bwd_base_key = f"{base_key}.ffn_down_bwd_exps"
has_bwd = self.has_tensor(f"{gate_bwd_base_key}.{0}.numa.{0}.weight")
if has_bwd:
up_bwd_weights = [[] for _ in range(max_numa_id + 1)]
gate_bwd_weights = [[] for _ in range(max_numa_id + 1)]
down_bwd_weights = [[] for _ in range(max_numa_id + 1)]
up_bwd_scales = [[] for _ in range(max_numa_id + 1)]
gate_bwd_scales = [[] for _ in range(max_numa_id + 1)]
down_bwd_scales = [[] for _ in range(max_numa_id + 1)]
for numa_id in range(max_numa_id + 1):
for expert_id in range(max_experts_count + 1):
up_key = f"{up_base_key}.{expert_id}.numa.{numa_id}.weight"
gate_key = f"{gate_base_key}.{expert_id}.numa.{numa_id}.weight"
down_key = f"{down_base_key}.{expert_id}.numa.{numa_id}.weight"
up_scale_key = f"{up_base_key}.{expert_id}.numa.{numa_id}.scale"
gate_scale_key = f"{gate_base_key}.{expert_id}.numa.{numa_id}.scale"
down_scale_key = f"{down_base_key}.{expert_id}.numa.{numa_id}.scale"
# make sure contiguous
up_tensor = self.load_tensor(up_key, device).numpy()
gate_tensor = self.load_tensor(gate_key, device).numpy()
down_tensor = self.load_tensor(down_key, device).numpy()
up_scale_tensor = self.load_tensor(up_scale_key, device).numpy()
gate_scale_tensor = self.load_tensor(gate_scale_key, device).numpy()
down_scale_tensor = self.load_tensor(down_scale_key, device).numpy()
up_weights[numa_id].append(up_tensor)
gate_weights[numa_id].append(gate_tensor)
down_weights[numa_id].append(down_tensor)
up_scales[numa_id].append(up_scale_tensor)
gate_scales[numa_id].append(gate_scale_tensor)
down_scales[numa_id].append(down_scale_tensor)
# Load backward weights if available
if has_bwd:
gate_bwd_weights[numa_id].append(
self.load_tensor(f"{gate_bwd_base_key}.{expert_id}.numa.{numa_id}.weight", device).numpy()
)
up_bwd_weights[numa_id].append(
self.load_tensor(f"{up_bwd_base_key}.{expert_id}.numa.{numa_id}.weight", device).numpy()
)
down_bwd_weights[numa_id].append(
self.load_tensor(f"{down_bwd_base_key}.{expert_id}.numa.{numa_id}.weight", device).numpy()
)
gate_bwd_scales[numa_id].append(
self.load_tensor(f"{gate_bwd_base_key}.{expert_id}.numa.{numa_id}.scale", device).numpy()
)
up_bwd_scales[numa_id].append(
self.load_tensor(f"{up_bwd_base_key}.{expert_id}.numa.{numa_id}.scale", device).numpy()
)
down_bwd_scales[numa_id].append(
self.load_tensor(f"{down_bwd_base_key}.{expert_id}.numa.{numa_id}.scale", device).numpy()
)
result = {
"up": up_weights,
"gate": gate_weights,
"down": down_weights,
"up_scale": up_scales,
"gate_scale": gate_scales,
"down_scale": down_scales,
}
if has_bwd:
result["gate_bwd"] = gate_bwd_weights
result["up_bwd"] = up_bwd_weights
result["down_bwd"] = down_bwd_weights
result["gate_bwd_scale"] = gate_bwd_scales
result["up_bwd_scale"] = up_bwd_scales
result["down_bwd_scale"] = down_bwd_scales
return result
def has_tensor(self, name: str):
return name in self.tensor_file_map
class FP8SafeTensorLoader(SafeTensorLoader):
"""Loader for FP8 expert weights with auto-detection of naming formats.
Supported formats:
- DeepSeek style: {base}.mlp.experts.{id}.{gate,up,down}_proj.weight
- Mixtral/MiniMax style: {base}.block_sparse_moe.experts.{id}.{w1,w3,w2}.weight
- Mistral style: {base}.experts.{id}.{w1,w3,w2}.weight
Supported scale formats (auto-detected):
- Block-wise: weight_scale_inv (DeepSeek FP8)
- Per-channel: weight_scale (GLM-4.7-FP8)
The format is auto-detected during initialization.
"""
# Known MoE naming formats: (experts_path_template, gate_name, up_name, down_name)
MOE_FORMATS = {
"deepseek": ("{base}.mlp.experts", "gate_proj", "up_proj", "down_proj"),
"mixtral": ("{base}.block_sparse_moe.experts", "w1", "w3", "w2"),
"mistral": ("{base}.experts", "w1", "w3", "w2"),
}
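    # Example gate-projection keys for expert 0 of layer 3 (illustrative):
    #   deepseek: model.layers.3.mlp.experts.0.gate_proj.weight
    #   mixtral:  model.layers.3.block_sparse_moe.experts.0.w1.weight
    #   mistral:  layers.3.experts.0.w1.weight   (native checkpoints omit the "model." prefix)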
def __init__(self, file_path: str, scale_suffix: str = None):
"""Initialize FP8 loader with optional scale suffix override.
Args:
file_path: Path to safetensor files
scale_suffix: Optional scale key suffix. If None, auto-detect between
'weight_scale_inv' (block-wise) and 'weight_scale' (per-channel).
"""
super().__init__(file_path)
self._detected_format = None
self._scale_suffix = scale_suffix # None means auto-detect
# Set per_channel based on explicit scale_suffix if provided
if scale_suffix == "weight_scale":
self._is_per_channel = True
elif scale_suffix == "weight_scale_inv":
self._is_per_channel = False
else:
self._is_per_channel = False # Will be updated in _detect_format if auto-detect
self._is_vl_model = False
self._detect_format()
def _detect_format(self):
"""Auto-detect the MoE naming format and scale format by checking tensor keys."""
# Sample some tensor names to detect format
sample_keys = list(self.tensor_file_map.keys())[:1000]
for fmt_name, (path_tpl, gate, up, down) in self.MOE_FORMATS.items():
# Check if any key matches this format pattern
# Look for pattern like: model.layers.0.{experts_path}.0.{gate_name}.weight
for key in sample_keys:
if ".experts." in key and f".{gate}.weight" in key:
# Verify the path template matches
if "block_sparse_moe.experts" in key and fmt_name == "mixtral":
self._detected_format = fmt_name
print(f"[FP8SafeTensorLoader] Detected format: {fmt_name}")
break
elif "mlp.experts" in key and "block_sparse_moe" not in key and fmt_name == "deepseek":
self._detected_format = fmt_name
print(f"[FP8SafeTensorLoader] Detected format: {fmt_name}")
break
elif fmt_name == "mistral" and ".mlp.experts" not in key and ".block_sparse_moe.experts" not in key:
self._detected_format = fmt_name
print(f"[FP8SafeTensorLoader] Detected format: {fmt_name}")
break
if self._detected_format:
break
# Default to deepseek if no format detected
if not self._detected_format:
self._detected_format = "deepseek"
print("[FP8SafeTensorLoader] No MoE format detected, defaulting to: deepseek")
# Auto-detect scale suffix if not specified
if self._scale_suffix is None:
_, gate, _, _ = self.MOE_FORMATS[self._detected_format]
# Check for per-channel scale (weight_scale) vs block-wise (weight_scale_inv)
for key in sample_keys:
if f".{gate}.weight_scale_inv" in key:
self._scale_suffix = "weight_scale_inv"
self._is_per_channel = False
print("[FP8SafeTensorLoader] Detected scale format: block-wise (weight_scale_inv)")
if key.startswith("model.language_model.") and self._detected_format == "deepseek":
# VL models(Qwen3.5): model.layers.{N} -> model.language_model.layers.{N}
self._is_vl_model = True
print("[FP8SafeTensorLoader] Detected VL model")
return
elif f".{gate}.weight_scale" in key and "weight_scale_inv" not in key:
self._scale_suffix = "weight_scale"
# Some models (e.g., Mistral) use block-wise FP8 scales but keep
# the key suffix as `weight_scale` (without `_inv`). Infer format
# from scale tensor shape instead of suffix alone:
# - per-channel: [N] or [N, 1]
# - block-wise: [N_block, K_block] (both dims > 1)
scale_tensor = self.load_tensor(key, device="cpu")
if scale_tensor.dim() == 1:
self._is_per_channel = True
elif scale_tensor.dim() == 2 and scale_tensor.shape[1] == 1:
self._is_per_channel = True
else:
self._is_per_channel = False
scale_kind = "per-channel" if self._is_per_channel else "block-wise"
print(f"[FP8SafeTensorLoader] Detected scale format: {scale_kind} (weight_scale)")
return
# Default to weight_scale_inv
self._scale_suffix = "weight_scale_inv"
self._is_per_channel = False
print("[FP8SafeTensorLoader] No scale format detected, defaulting to: weight_scale_inv")
else:
# Scale suffix was explicitly provided
scale_type = "per-channel" if self._is_per_channel else "block-wise"
print(f"[FP8SafeTensorLoader] Using explicit scale format: {scale_type} ({self._scale_suffix})")
def _get_experts_prefix_candidates(self, base_key: str) -> list[str]:
"""Get candidate experts prefixes based on detected format and base key variants."""
path_tpl, _, _, _ = self.MOE_FORMATS[self._detected_format]
candidates = []
if self._is_vl_model:
base_key = base_key.replace("model.layers", "model.language_model.layers")
candidates.append(path_tpl.format(base=base_key))
# Some model weights (e.g., Mistral native format) do not have "model." prefix.
if base_key.startswith("model."):
candidates.append(path_tpl.format(base=base_key[len("model.") :]))
# Deduplicate while preserving order.
return list(dict.fromkeys(candidates))
def _get_proj_names(self):
"""Get projection names (gate, up, down) based on detected format."""
_, gate, up, down = self.MOE_FORMATS[self._detected_format]
return gate, up, down
def load_tensor(self, key: str, device: str = "cpu"):
if key not in self.tensor_file_map:
raise KeyError(f"Key {key} not found in Safetensor files")
file = self.tensor_file_map[key]
f = self.file_handle_map.get(file)
if f is None:
raise FileNotFoundError(f"File {file} not found in Safetensor files")
tensor = f.get_tensor(key)
if device == "cpu":
return tensor
return tensor.to(device)
def load_experts(self, base_key: str, device: str = "cpu"):
"""Load FP8 expert weights and their scale tensors.
Supports both block-wise (weight_scale_inv) and per-channel (weight_scale) formats.
Per-channel scales are squeezed from [N, 1] to [N] if needed.
"""
experts_prefix_candidates = self._get_experts_prefix_candidates(base_key)
gate_name, up_name, down_name = self._get_proj_names()
expert_count = 0
experts_prefix = None
for prefix in experts_prefix_candidates:
expert_count = 0
while self.has_tensor(f"{prefix}.{expert_count}.{gate_name}.weight"):
expert_count += 1
if expert_count > 0:
experts_prefix = prefix
break
if expert_count == 0 or experts_prefix is None:
raise ValueError(f"No experts found for keys: {experts_prefix_candidates}")
gate_weights = [None] * expert_count
up_weights = [None] * expert_count
down_weights = [None] * expert_count
gate_scales = [None] * expert_count
up_scales = [None] * expert_count
down_scales = [None] * expert_count
for exp_id in range(expert_count):
gate_w_key = f"{experts_prefix}.{exp_id}.{gate_name}.weight"
up_w_key = f"{experts_prefix}.{exp_id}.{up_name}.weight"
down_w_key = f"{experts_prefix}.{exp_id}.{down_name}.weight"
gate_s_key = f"{experts_prefix}.{exp_id}.{gate_name}.{self._scale_suffix}"
up_s_key = f"{experts_prefix}.{exp_id}.{up_name}.{self._scale_suffix}"
down_s_key = f"{experts_prefix}.{exp_id}.{down_name}.{self._scale_suffix}"
gate_weights[exp_id] = self.load_tensor(gate_w_key, device).contiguous()
up_weights[exp_id] = self.load_tensor(up_w_key, device).contiguous()
down_weights[exp_id] = self.load_tensor(down_w_key, device).contiguous()
gate_scale = self.load_tensor(gate_s_key, device)
up_scale = self.load_tensor(up_s_key, device)
down_scale = self.load_tensor(down_s_key, device)
# For per-channel scales, squeeze [N, 1] -> [N] if needed
if self._is_per_channel:
if gate_scale.dim() == 2 and gate_scale.shape[1] == 1:
gate_scale = gate_scale.squeeze(1)
if up_scale.dim() == 2 and up_scale.shape[1] == 1:
up_scale = up_scale.squeeze(1)
if down_scale.dim() == 2 and down_scale.shape[1] == 1:
down_scale = down_scale.squeeze(1)
gate_scales[exp_id] = gate_scale.contiguous()
up_scales[exp_id] = up_scale.contiguous()
down_scales[exp_id] = down_scale.contiguous()
return {
"gate": gate_weights,
"up": up_weights,
"down": down_weights,
"gate_scale": gate_scales,
"up_scale": up_scales,
"down_scale": down_scales,
}
def is_per_channel(self) -> bool:
"""Return True if using per-channel quantization, False for block-wise."""
return self._is_per_channel
class BF16SafeTensorLoader(SafeTensorLoader):
"""Loader for native BF16 expert weights (no quantization, no scales).
Supported formats:
- DeepSeek style: {base}.mlp.experts.{id}.{gate,up,down}_proj.weight
- Mixtral/MiniMax style: {base}.block_sparse_moe.experts.{id}.{w1,w3,w2}.weight
- Mistral style: {base}.experts.{id}.{w1,w3,w2}.weight
The format is auto-detected during initialization.
"""
MOE_FORMATS = {
"deepseek": ("{base}.mlp.experts", "gate_proj", "up_proj", "down_proj"),
"mixtral": ("{base}.block_sparse_moe.experts", "w1", "w3", "w2"),
"mistral": ("{base}.experts", "w1", "w3", "w2"),
}
def __init__(self, file_path: str):
super().__init__(file_path)
self._detected_format = None
self._detect_format()
def _detect_format(self):
"""Auto-detect the MoE naming format by checking tensor keys."""
sample_keys = list(self.tensor_file_map.keys())[:1000]
# Check for packed format first (Qwen3.5 MoE style: all experts in one 3D tensor)
for key in sample_keys:
if key.endswith(".mlp.experts.gate_up_proj"):
self._detected_format = "packed"
print("[BF16SafeTensorLoader] Detected format: packed (Qwen3.5 MoE style)")
return
for fmt_name, (path_tpl, gate, up, down) in self.MOE_FORMATS.items():
for key in sample_keys:
if ".experts." in key and f".{gate}.weight" in key:
if "block_sparse_moe.experts" in key and fmt_name == "mixtral":
self._detected_format = fmt_name
print(f"[BF16SafeTensorLoader] Detected format: {fmt_name}")
return
elif "mlp.experts" in key and "block_sparse_moe" not in key and fmt_name == "deepseek":
self._detected_format = fmt_name
print(f"[BF16SafeTensorLoader] Detected format: {fmt_name}")
return
elif fmt_name == "mistral" and ".mlp.experts" not in key and ".block_sparse_moe.experts" not in key:
self._detected_format = fmt_name
print(f"[BF16SafeTensorLoader] Detected format: {fmt_name}")
return
self._detected_format = "deepseek"
print("[BF16SafeTensorLoader] No MoE format detected, defaulting to: deepseek")
def _get_experts_prefix_candidates(self, base_key: str) -> list[str]:
"""Get candidate experts prefixes based on detected format and base key variants."""
path_tpl, _, _, _ = self.MOE_FORMATS[self._detected_format]
candidates = [path_tpl.format(base=base_key)]
# Some model weights (e.g., Mistral native format) do not have "model." prefix.
if base_key.startswith("model."):
candidates.append(path_tpl.format(base=base_key[len("model.") :]))
return list(dict.fromkeys(candidates))
def _get_proj_names(self):
"""Get projection names (gate, up, down) based on detected format."""
_, gate, up, down = self.MOE_FORMATS[self._detected_format]
return gate, up, down
def load_tensor(self, key: str, device: str = "cpu"):
if key not in self.tensor_file_map:
raise KeyError(f"Key {key} not found in Safetensor files")
file = self.tensor_file_map[key]
f = self.file_handle_map.get(file)
if f is None:
raise FileNotFoundError(f"File {file} not found in Safetensor files")
tensor = f.get_tensor(key)
if device == "cpu":
return tensor
return tensor.to(device)
def load_experts(self, base_key: str, device: str = "cpu"):
"""Load BF16 expert weights (no scales needed)."""
if self._detected_format == "packed":
return self._load_experts_packed(base_key, device)
experts_prefix_candidates = self._get_experts_prefix_candidates(base_key)
gate_name, up_name, down_name = self._get_proj_names()
expert_count = 0
experts_prefix = None
for prefix in experts_prefix_candidates:
expert_count = 0
while self.has_tensor(f"{prefix}.{expert_count}.{gate_name}.weight"):
expert_count += 1
if expert_count > 0:
experts_prefix = prefix
break
if expert_count == 0 or experts_prefix is None:
raise ValueError(f"No experts found for keys: {experts_prefix_candidates}")
gate_weights = [None] * expert_count
up_weights = [None] * expert_count
down_weights = [None] * expert_count
for exp_id in range(expert_count):
gate_w_key = f"{experts_prefix}.{exp_id}.{gate_name}.weight"
up_w_key = f"{experts_prefix}.{exp_id}.{up_name}.weight"
down_w_key = f"{experts_prefix}.{exp_id}.{down_name}.weight"
gate_weights[exp_id] = self.load_tensor(gate_w_key, device).contiguous()
up_weights[exp_id] = self.load_tensor(up_w_key, device).contiguous()
down_weights[exp_id] = self.load_tensor(down_w_key, device).contiguous()
return {
"gate": gate_weights,
"up": up_weights,
"down": down_weights,
}
def _resolve_packed_experts_prefix(self, base_key: str) -> str:
"""Resolve the experts prefix for packed format, trying fallbacks."""
# Direct: model.layers.{N}.mlp.experts
experts_prefix = f"{base_key}.mlp.experts"
if self.has_tensor(f"{experts_prefix}.gate_up_proj"):
return experts_prefix
# VL models: model.layers.{N} -> model.language_model.layers.{N}
parts = base_key.split(".", 1)
if len(parts) == 2:
alt_base = f"{parts[0]}.language_model.{parts[1]}"
experts_prefix = f"{alt_base}.mlp.experts"
if self.has_tensor(f"{experts_prefix}.gate_up_proj"):
return experts_prefix
raise ValueError(f"No packed experts found for base_key '{base_key}'.")
def _load_experts_packed(self, base_key: str, device: str = "cpu"):
"""Load packed expert weights (Qwen3.5 MoE style).
Packed format stores all experts in stacked 3D tensors:
- gate_up_proj: [num_experts, 2 * intermediate_size, hidden_size]
- down_proj: [num_experts, hidden_size, intermediate_size]
"""
experts_prefix = self._resolve_packed_experts_prefix(base_key)
gate_up_key = f"{experts_prefix}.gate_up_proj"
down_key = f"{experts_prefix}.down_proj"
gate_up = self.load_tensor(gate_up_key, device) # [E, 2*I, H]
down = self.load_tensor(down_key, device) # [E, H, I]
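        # Per expert, the gate rows are the first half of dim 1 and the up rows the second
        # half, e.g. (illustrative shapes) gate_up [E, 1536, H] -> gate = [:, :768, :],
        # up = [:, 768:, :].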
mid = gate_up.shape[1] // 2
gate_list = [gate_up[i, :mid, :].contiguous() for i in range(gate_up.shape[0])]
up_list = [gate_up[i, mid:, :].contiguous() for i in range(gate_up.shape[0])]
down_list = [down[i].contiguous() for i in range(down.shape[0])]
return {
"gate": gate_list,
"up": up_list,
"down": down_list,
}
class CompressedSafeTensorLoader(SafeTensorLoader):
"""Loader for compressed SafeTensor layouts (RAWINT4 weights)."""
def load_experts(self, base_key: str, device: str = "cpu"):
"""Load raw expert weights stored in compressed safetensor format."""
experts_prefix = f"{base_key}.mlp.experts"
expert_idx = 0
while self.has_tensor(f"{experts_prefix}.{expert_idx}.up_proj.weight_packed"):
expert_idx += 1
if expert_idx == 0:
experts_prefix = f"language_model.{base_key}.mlp.experts"
expert_idx = 0
while self.has_tensor(f"{experts_prefix}.{expert_idx}.up_proj.weight_packed"):
expert_idx += 1
if expert_idx == 0:
raise ValueError(f"No experts found for key {experts_prefix}")
def load_projection(proj_name: str):
weight_entries = []
scale_entries = []
for exp_id in range(expert_idx):
weight_key = f"{experts_prefix}.{exp_id}.{proj_name}_proj.weight_packed"
scale_key = f"{experts_prefix}.{exp_id}.{proj_name}_proj.weight_scale"
if not self.has_tensor(weight_key):
raise KeyError(f"Missing tensor: {weight_key}")
if not self.has_tensor(scale_key):
raise KeyError(f"Missing tensor: {scale_key}")
weight_tensor = self.load_tensor(weight_key, device).contiguous()
scale_tensor = self.load_tensor(scale_key, device).contiguous()
weight_entries.append(weight_tensor)
scale_entries.append(scale_tensor)
return weight_entries, scale_entries
gate_weights, gate_scales = load_projection("gate")
up_weights, up_scales = load_projection("up")
down_weights, down_scales = load_projection("down")
return {
"gate": gate_weights,
"up": up_weights,
"down": down_weights,
"gate_scale": gate_scales,
"up_scale": up_scales,
"down_scale": down_scales,
}
class BF16SafeTensorLoader(SafeTensorLoader):
"""Loader for native BF16 expert weights (no quantization, no scales).
Supported formats:
- DeepSeek style: {base}.mlp.experts.{id}.{gate,up,down}_proj.weight
- Mixtral/MiniMax style: {base}.block_sparse_moe.experts.{id}.{w1,w3,w2}.weight
The format is auto-detected during initialization.
"""
MOE_FORMATS = {
"deepseek": ("{base}.mlp.experts", "gate_proj", "up_proj", "down_proj"),
"mixtral": ("{base}.block_sparse_moe.experts", "w1", "w3", "w2"),
}
def __init__(self, file_path: str):
super().__init__(file_path)
self._detected_format = None
self._detect_format()
def _detect_format(self):
"""Auto-detect the MoE naming format by checking tensor keys."""
sample_keys = list(self.tensor_file_map.keys())[:1000]
for fmt_name, (path_tpl, gate, up, down) in self.MOE_FORMATS.items():
for key in sample_keys:
if ".experts." in key and f".{gate}.weight" in key:
if "block_sparse_moe.experts" in key and fmt_name == "mixtral":
self._detected_format = fmt_name
print(f"[BF16SafeTensorLoader] Detected format: {fmt_name}")
return
elif "mlp.experts" in key and "block_sparse_moe" not in key and fmt_name == "deepseek":
self._detected_format = fmt_name
print(f"[BF16SafeTensorLoader] Detected format: {fmt_name}")
return
self._detected_format = "deepseek"
print("[BF16SafeTensorLoader] No MoE format detected, defaulting to: deepseek")
def _get_experts_prefix(self, base_key: str) -> str:
"""Get the experts prefix based on detected format."""
path_tpl, _, _, _ = self.MOE_FORMATS[self._detected_format]
return path_tpl.format(base=base_key)
def _get_proj_names(self):
"""Get projection names (gate, up, down) based on detected format."""
_, gate, up, down = self.MOE_FORMATS[self._detected_format]
return gate, up, down
def load_tensor(self, key: str, device: str = "cpu"):
if key not in self.tensor_file_map:
raise KeyError(f"Key {key} not found in Safetensor files")
file = self.tensor_file_map[key]
f = self.file_handle_map.get(file)
if f is None:
raise FileNotFoundError(f"File {file} not found in Safetensor files")
tensor = f.get_tensor(key)
if device == "cpu":
return tensor
return tensor.to(device)
def load_experts(self, base_key: str, device: str = "cpu"):
"""Load BF16 expert weights (no scales needed).
Args:
base_key: Base key like "model.layers.{layer_index}"
device: Target device for tensors
Returns:
Dictionary with keys: gate, up, down, gate_scale (None), up_scale (None), down_scale (None)
gate/up/down: list of tensors [expert_id] -> tensor
"""
experts_prefix = self._get_experts_prefix(base_key)
gate_name, up_name, down_name = self._get_proj_names()
expert_count = 0
while self.has_tensor(f"{experts_prefix}.{expert_count}.{gate_name}.weight"):
expert_count += 1
if expert_count == 0:
raise ValueError(f"No experts found for key {experts_prefix}")
gate_weights = [None] * expert_count
up_weights = [None] * expert_count
down_weights = [None] * expert_count
for exp_id in range(expert_count):
gate_w_key = f"{experts_prefix}.{exp_id}.{gate_name}.weight"
up_w_key = f"{experts_prefix}.{exp_id}.{up_name}.weight"
down_w_key = f"{experts_prefix}.{exp_id}.{down_name}.weight"
gate_weights[exp_id] = self.load_tensor(gate_w_key, device).contiguous()
up_weights[exp_id] = self.load_tensor(up_w_key, device).contiguous()
down_weights[exp_id] = self.load_tensor(down_w_key, device).contiguous()
return {
"gate": gate_weights,
"up": up_weights,
"down": down_weights,
"gate_scale": None,
"up_scale": None,
"down_scale": None,
}
class GGUFLoader:
"""
GGUF format loader using the official gguf library (gguf.gguf_reader.GGUFReader)
This is a cleaner implementation compared to manual binary parsing.
"""
def __init__(self, gguf_path: str):
"""
Initialize GGUF loader from a file or directory
Args:
gguf_path: Path to a single GGUF file or a directory containing GGUF files
"""
if not os.path.exists(gguf_path):
raise FileNotFoundError(f"GGUF path not found: {gguf_path}")
self.tensor_info = {}
self.metadata = {}
self.tensor_file_map = {}
self.file_data_map = {}
if os.path.isfile(gguf_path) and gguf_path.endswith(".gguf"):
            print(f"\n[GGUFLoader] Loading single GGUF file: {os.path.basename(gguf_path)}")
self._load_single_file(gguf_path)
elif os.path.isdir(gguf_path):
print(f"\n[GGUFLoader] Loading GGUF files from directory: {gguf_path}")
self._load_directory(gguf_path)
else:
raise ValueError(f"Path must be a .gguf file or a directory: {gguf_path}")
print(f"[GGUFLoader] Summary:")
print(f" Files loaded: {len(self.file_data_map)}")
print(f" Total tensors: {len(self.tensor_info)}")
print(f" Metadata keys: {len(self.metadata)}")
tensors = ["blk.0.ffn_up_exps.weight", "blk.0.ffn_gate_exps.weight", "blk.0.ffn_down_exps.weight"]
for key in tensors:
if key in self.tensor_info:
info = self.tensor_info[key]
print(f" {'.'.join(key.split('.')[2:-1])}, Dtype: {info['dtype'].name}")
def _load_single_file(self, file_path: str):
"""Load a single GGUF file"""
reader = GGUFReader(file_path)
for key, field in reader.fields.items():
value = field.parts[field.data[0]]
if isinstance(value, bytes):
value = value.decode("utf-8")
elif isinstance(value, np.ndarray) and value.dtype == np.uint8:
try:
value = bytes(value).decode("utf-8")
except:
pass
self.metadata[key] = value
for tensor in reader.tensors:
self.tensor_info[tensor.name] = {
"shape": list(reversed(tensor.shape)), # Reverse to match PyTorch order
"dtype": tensor.tensor_type,
"offset": tensor.data_offset,
"n_elements": tensor.n_elements,
}
self.tensor_file_map[tensor.name] = file_path
self.file_data_map[file_path] = np.memmap(file_path, mode="r")
def _load_directory(self, dir_path: str):
"""Load all GGUF files from a directory (non-recursive)"""
found_gguf = False
for file in sorted(os.listdir(dir_path)):
if file.endswith(".gguf"):
found_gguf = True
file_path = os.path.join(dir_path, file)
print(f" Loading: {file}")
reader = GGUFReader(file_path)
for key, field in reader.fields.items():
value = field.parts[field.data[0]]
if isinstance(value, bytes):
value = value.decode("utf-8")
elif isinstance(value, np.ndarray) and value.dtype == np.uint8:
try:
value = bytes(value).decode("utf-8")
except:
pass
self.metadata[key] = value
for tensor in reader.tensors:
self.tensor_info[tensor.name] = {
"shape": list(reversed(tensor.shape)),
"dtype": tensor.tensor_type,
"offset": tensor.data_offset,
"n_elements": tensor.n_elements,
}
self.tensor_file_map[tensor.name] = file_path
self.file_data_map[file_path] = np.memmap(file_path, mode="r")
if not found_gguf:
raise FileNotFoundError(f"No .gguf files found in directory: {dir_path}")
def get_model_config(self, layer_idx: int = 0):
"""
Extract model configuration from GGUF metadata and tensor shapes.
Args:
layer_idx: Layer index to inspect (default: 0)
Returns:
dict with keys: num_experts, num_experts_per_tok, hidden_size, moe_intermediate_size
"""
config = {}
arch = self.metadata.get("general.architecture", "unknown")
num_experts = None
for key_suffix in [
"expert_count",
"expert.count",
"moe.expert_count",
"expert_feed_forward_length",
]:
key = f"{arch}.{key_suffix}"
if key in self.metadata:
val = self.metadata[key]
num_experts = int(val[0]) if isinstance(val, (list, np.ndarray)) else int(val)
break
num_experts_per_tok = None
for key_suffix in [
"expert_used_count",
"expert.used_count",
"moe.num_experts_per_tok",
]:
key = f"{arch}.{key_suffix}"
if key in self.metadata:
val = self.metadata[key]
num_experts_per_tok = int(val[0]) if isinstance(val, (list, np.ndarray)) else int(val)
break
hidden_size = None
for key_suffix in [
"embedding_length",
"embed_length",
"hidden_size",
]:
key = f"{arch}.{key_suffix}"
if key in self.metadata:
val = self.metadata[key]
hidden_size = int(val[0]) if isinstance(val, (list, np.ndarray)) else int(val)
break
moe_intermediate_size = None
for key_suffix in [
"expert_feed_forward_length",
"feed_forward_length",
"ffn_length",
"intermediate_size",
]:
key = f"{arch}.{key_suffix}"
if key in self.metadata:
val = self.metadata[key]
moe_intermediate_size = int(val[0]) if isinstance(val, (list, np.ndarray)) else int(val)
break
if any(v is None for v in [num_experts, hidden_size, moe_intermediate_size]):
base_key = f"blk.{layer_idx}.ffn_gate_exps.weight"
if base_key in self.tensor_info:
gate_shape = self.tensor_info[base_key]["shape"]
print(f" Found tensor '{base_key}' with shape: {gate_shape}")
if len(gate_shape) >= 3:
if num_experts is None:
num_experts = int(gate_shape[0])
if moe_intermediate_size is None:
moe_intermediate_size = int(gate_shape[1])
if hidden_size is None:
hidden_size = int(gate_shape[2])
config = {
"num_experts": num_experts,
"num_experts_per_tok": num_experts_per_tok,
"hidden_size": hidden_size,
"moe_intermediate_size": moe_intermediate_size,
}
return config
def print_metadata(self, filter_keywords=None):
"""
Print GGUF file metadata for debugging.
Args:
filter_keywords: Optional list of keywords to filter metadata keys
"""
print(f"\n[GGUFLoader] GGUF Metadata:")
print(f" Total metadata entries: {len(self.metadata)}")
if filter_keywords:
filtered = {
k: v for k, v in self.metadata.items() if any(kw.lower() in k.lower() for kw in filter_keywords)
}
for k, v in sorted(filtered.items()):
print(f" {k}: {v}")
else:
for k, v in sorted(self.metadata.items()):
print(f" {k}: {v}")
def has_tensor(self, name: str):
"""Check if tensor exists"""
name = translate_name_to_gguf(name)
return name in self.tensor_info
def get_ggml_type(self, name: str):
"""Get GGML type of a tensor"""
name = translate_name_to_gguf(name)
if name not in self.tensor_info:
raise KeyError(f"Tensor '{name}' not found in GGUF files")
return self.tensor_info[name]["dtype"]
def get_undequanted_tensor_and_ggml_type(self, name: str):
"""
Get tensor data and its GGML type without dequantizing
Args:
name: Tensor name (in PyTorch format, will be translated to GGUF format)
Returns:
(data, ggml_type): Tuple of tensor data and GGML quantization type
"""
name = translate_name_to_gguf(name)
if name not in self.tensor_info:
raise KeyError(f"Tensor '{name}' not found in GGUF files")
info = self.tensor_info[name]
file_path = self.tensor_file_map[name]
mmap_data = self.file_data_map[file_path]
offset = info["offset"]
n_elements = info["n_elements"]
ggml_type = info["dtype"]
GGML_QUANT_SIZES = {
GGMLQuantizationType.F32: (1, 4),
GGMLQuantizationType.F16: (1, 2),
GGMLQuantizationType.BF16: (1, 2),
GGMLQuantizationType.Q4_0: (32, 2 + 16),
GGMLQuantizationType.Q4_1: (32, 2 + 2 + 16),
GGMLQuantizationType.Q5_0: (32, 2 + 4 + 16),
GGMLQuantizationType.Q5_1: (32, 2 + 2 + 4 + 16),
GGMLQuantizationType.Q8_0: (32, 2 + 32),
GGMLQuantizationType.Q8_1: (32, 4 + 4 + 32),
GGMLQuantizationType.Q2_K: (256, 2 + 2 + 256 // 16 + 256 // 4),
GGMLQuantizationType.Q3_K: (256, 2 + 256 // 4 + 256 // 8 + 12),
GGMLQuantizationType.Q4_K: (256, 2 + 2 + 256 // 2 + 12),
GGMLQuantizationType.Q5_K: (256, 2 + 2 + 256 // 2 + 256 // 8 + 12),
GGMLQuantizationType.Q6_K: (256, 2 + 256 // 2 + 256 // 4 + 256 // 16),
GGMLQuantizationType.Q8_K: (256, 4 + 256 + 256 // 8),
GGMLQuantizationType.IQ2_XXS: (256, 2 + 256 // 4),
GGMLQuantizationType.IQ2_XS: (256, 2 + 256 // 4 + 256 // 32),
GGMLQuantizationType.IQ3_XXS: (256, 2 + 256 // 4 + 256 // 8),
GGMLQuantizationType.IQ1_S: (256, 2 + 256 // 8 + 256 // 16),
GGMLQuantizationType.IQ4_NL: (32, 2 + 16),
GGMLQuantizationType.IQ3_S: (256, 2 + 256 // 4 + 256 // 8 + 256 // 32 + 4),
GGMLQuantizationType.IQ2_S: (256, 2 + 256 // 4 + 256 // 16),
GGMLQuantizationType.IQ4_XS: (256, 2 + 2 + 256 // 2 + 256 // 64),
GGMLQuantizationType.I8: (1, 1),
GGMLQuantizationType.I16: (1, 2),
GGMLQuantizationType.I32: (1, 4),
GGMLQuantizationType.I64: (1, 8),
GGMLQuantizationType.F64: (1, 8),
GGMLQuantizationType.IQ1_M: (256, 256 // 8 + 256 // 16 + 256 // 32),
}
block_size, type_size = GGML_QUANT_SIZES[ggml_type]
n_bytes = n_elements * type_size // block_size
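        # Worked example: Q4_K has block_size=256 and type_size=144, so a tensor with
        # 14_680_064 elements occupies 14_680_064 * 144 // 256 = 8_257_536 bytes.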
data_bytes = mmap_data[offset : offset + n_bytes]
data = torch.from_numpy(np.frombuffer(data_bytes, dtype=np.uint8).copy())
return data, ggml_type
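# Illustrative usage (paths and tensor names are placeholders):
#   loader = GGUFLoader("/models/some-model-q4_k_m")          # single .gguf file or a directory of shards
#   cfg = loader.get_model_config()                           # num_experts, hidden_size, ...
#   data, qtype = loader.get_undequanted_tensor_and_ggml_type(
#       "model.layers.0.mlp.experts.ffn_up_exps.weight")      # translated to "blk.0.ffn_up_exps.weight"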
class GPTQSafeTensorLoader(FP8SafeTensorLoader):
"""Loader for symmetric GPTQ-Int4 expert weights (qweight + scales, no qzeros).
Only supports sym=true, desc_act=false GPTQ models.
Tensor keys:
- qweight: {prefix}.{id}.{proj}.qweight (int32, packed 8x4-bit along K)
- scales: {prefix}.{id}.{proj}.scales (fp16 -> converted to fp32)
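    Shapes (typical symmetric 4-bit GPTQ; illustrative, not verified against every checkpoint):
    for a [K, N] weight with group size G, qweight is int32 [K // 8, N] (eight 4-bit values
    packed along K) and scales is fp16 [K // G, N].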
"""
def __init__(self, file_path: str):
# Call FP8SafeTensorLoader init (which calls SafeTensorLoader init + format detection)
super().__init__(file_path, scale_suffix="scales")
# Verify GPTQ config
self._verify_gptq_config(file_path)
def _detect_format(self):
"""Override FP8 format detection to look for .qweight instead of .weight."""
sample_keys = list(self.tensor_file_map.keys())[:2000]
for fmt_name, (path_tpl, gate, up, down) in self.MOE_FORMATS.items():
for key in sample_keys:
if ".experts." in key and f".{gate}.qweight" in key:
if "block_sparse_moe.experts" in key and fmt_name == "mixtral":
self._detected_format = fmt_name
break
elif "mlp.experts" in key and "block_sparse_moe" not in key and fmt_name == "deepseek":
self._detected_format = fmt_name
# Check for VL model (language_model prefix)
if "language_model." in key:
self._is_vl_model = True
break
elif fmt_name == "mistral" and "block_sparse_moe" not in key and "mlp" not in key:
self._detected_format = fmt_name
break
if self._detected_format is not None:
break
if self._detected_format is None:
self._detected_format = "deepseek"
vl_str = " (VL model)" if self._is_vl_model else ""
print(f"[GPTQSafeTensorLoader] Detected format: {self._detected_format}{vl_str}")
def _verify_gptq_config(self, file_path):
"""Check that the model uses sym=true, desc_act=false."""
import json
import os
config_path = os.path.join(os.path.dirname(file_path), "config.json")
if not os.path.exists(config_path):
# Try parent directory
config_path = os.path.join(file_path, "config.json")
if os.path.exists(config_path):
with open(config_path) as f:
config = json.load(f)
qc = config.get("quantization_config", {})
if qc.get("quant_method") == "gptq":
if qc.get("desc_act", False):
raise NotImplementedError(
"GPTQ desc_act=true is not supported. Only desc_act=false models are supported."
)
if not qc.get("sym", True):
raise NotImplementedError(
"GPTQ sym=false (asymmetric) is not supported. Only sym=true models are supported."
)
print(f"[GPTQSafeTensorLoader] Verified: sym={qc.get('sym')}, desc_act={qc.get('desc_act')}, "
f"bits={qc.get('bits')}, group_size={qc.get('group_size')}")
def load_experts(self, base_key: str, device: str = "cpu"):
"""Load GPTQ expert qweight and scales.
Returns dict with keys: gate, up, down (qweight int32), gate_scale, up_scale, down_scale (fp32).
"""
experts_prefix_candidates = self._get_experts_prefix_candidates(base_key)
gate_name, up_name, down_name = self._get_proj_names()
expert_count = 0
experts_prefix = None
for prefix in experts_prefix_candidates:
expert_count = 0
while self.has_tensor(f"{prefix}.{expert_count}.{gate_name}.qweight"):
expert_count += 1
if expert_count > 0:
experts_prefix = prefix
break
if expert_count == 0 or experts_prefix is None:
raise ValueError(f"No GPTQ experts found for keys: {experts_prefix_candidates}")
gate_weights = [None] * expert_count
up_weights = [None] * expert_count
down_weights = [None] * expert_count
gate_scales = [None] * expert_count
up_scales = [None] * expert_count
down_scales = [None] * expert_count
for exp_id in range(expert_count):
gate_weights[exp_id] = self.load_tensor(f"{experts_prefix}.{exp_id}.{gate_name}.qweight", device).contiguous()
up_weights[exp_id] = self.load_tensor(f"{experts_prefix}.{exp_id}.{up_name}.qweight", device).contiguous()
down_weights[exp_id] = self.load_tensor(f"{experts_prefix}.{exp_id}.{down_name}.qweight", device).contiguous()
gate_scales[exp_id] = self.load_tensor(f"{experts_prefix}.{exp_id}.{gate_name}.scales", device).float().contiguous()
up_scales[exp_id] = self.load_tensor(f"{experts_prefix}.{exp_id}.{up_name}.scales", device).float().contiguous()
down_scales[exp_id] = self.load_tensor(f"{experts_prefix}.{exp_id}.{down_name}.scales", device).float().contiguous()
print(f"[GPTQSafeTensorLoader] Loaded {expert_count} experts from {experts_prefix}")
return {
"gate": gate_weights,
"up": up_weights,
"down": down_weights,
"gate_scale": gate_scales,
"up_scale": up_scales,
"down_scale": down_scales,
}
class MXFP4SafeTensorLoader(SafeTensorLoader):
"""Loader for native MXFP4 expert weights (DeepSeek-V4-Flash format).
Per expert layout:
{base}.ffn.experts.{i}.w1.weight I8 [N, K/2] nibble-packed E2M1 (gate)
{base}.ffn.experts.{i}.w1.scale F8_E8M0 [N, K/32] ue8m0 group scale
{base}.ffn.experts.{i}.w3.{weight,scale} up
{base}.ffn.experts.{i}.w2.{weight,scale} down
V4 ckpt keys are not prefixed with ``model.``; we also probe the stripped form so
callers can keep passing ``base_key="model.layers.{L}"``. ue8m0 → bf16 is a lossless
bit shift (both have an 8-bit exponent and zero mantissa for ue8m0), and the AMX
FP4 backend already consumes bf16 scales.
"""
EXPERTS_PATH_TPL = "{base}.ffn.experts"
PROJ_NAMES = ("w1", "w3", "w2") # (gate, up, down)
def _experts_prefix_candidates(self, base_key: str) -> list[str]:
candidates = [self.EXPERTS_PATH_TPL.format(base=base_key)]
if base_key.startswith("model."):
candidates.append(self.EXPERTS_PATH_TPL.format(base=base_key[len("model.") :]))
return list(dict.fromkeys(candidates))
@staticmethod
def _ue8m0_to_bf16(scale_t: torch.Tensor) -> torch.Tensor:
if scale_t.dtype != torch.uint8:
scale_t = scale_t.view(torch.uint8)
# bf16 = [sign(1) | exp(8) | mant(7)]; setting mant=0, exp=e gives 2^(e-127),
# which is exactly the value encoded by ue8m0 for e ∈ [1, 254]. e=0 → bf16 +0
# (acceptable: ue8m0=0 represents 2^-127, below bf16 normal range), e=255 → +inf.
# Compute in int32 then narrow to int16 (max value is 255<<7=32640, fits int16),
# because torch CPU has no lshift kernel for uint16.
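        # Worked example: ue8m0 byte 127 -> 127 << 7 = 0x3F80, i.e. bf16 1.0;
        # byte 130 -> 0x4100, i.e. bf16 8.0 (2^(130-127)).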
return (scale_t.to(torch.int32) << 7).to(torch.int16).view(torch.bfloat16).contiguous()
def load_experts(self, base_key: str, device: str = "cpu"):
gate_name, up_name, down_name = self.PROJ_NAMES
prefix = None
expert_count = 0
for cand in self._experts_prefix_candidates(base_key):
expert_count = 0
while self.has_tensor(f"{cand}.{expert_count}.{gate_name}.weight"):
expert_count += 1
if expert_count > 0:
prefix = cand
break
if prefix is None:
raise ValueError(
f"No MXFP4 experts found under any of: {self._experts_prefix_candidates(base_key)}"
)
gate_weights = [None] * expert_count
up_weights = [None] * expert_count
down_weights = [None] * expert_count
gate_scales = [None] * expert_count
up_scales = [None] * expert_count
down_scales = [None] * expert_count
for exp_id in range(expert_count):
for proj, dst in (
(gate_name, gate_weights),
(up_name, up_weights),
(down_name, down_weights),
):
w = self.load_tensor(f"{prefix}.{exp_id}.{proj}.weight", device).contiguous()
if w.dtype != torch.uint8:
w = w.view(torch.uint8)
dst[exp_id] = w
for proj, dst in (
(gate_name, gate_scales),
(up_name, up_scales),
(down_name, down_scales),
):
s = self.load_tensor(f"{prefix}.{exp_id}.{proj}.scale", device)
dst[exp_id] = self._ue8m0_to_bf16(s)
print(f"[MXFP4SafeTensorLoader] Loaded {expert_count} experts from {prefix}")
return {
"gate": gate_weights,
"up": up_weights,
"down": down_weights,
"gate_scale": gate_scales,
"up_scale": up_scales,
"down_scale": down_scales,
}