From 20262b2743bd38e1f617fb5d3eadc8c0a919b33a Mon Sep 17 00:00:00 2001 From: "VYSE V.E.O" Date: Thu, 26 Feb 2026 15:47:22 +0800 Subject: [PATCH] Fix Qwen3.5 FP8 load for VL detection (#1857) * Fix Qwen3.5 FP8 load for VL detection 1. For VL models (Qwen3.5), modify base_key: model.layers.{N} -> model.language_model.layers.{N} 2. Remove the duplicated class BF16SafeTensorLoader(SafeTensorLoader): only the first definition, which was overridden by the later one, is removed. * Indent type Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --------- Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- kt-kernel/python/utils/loader.py | 109 ++----------------------------- 1 file changed, 7 insertions(+), 102 deletions(-) diff --git a/kt-kernel/python/utils/loader.py b/kt-kernel/python/utils/loader.py index 68e53cc..250a99e 100644 --- a/kt-kernel/python/utils/loader.py +++ b/kt-kernel/python/utils/loader.py @@ -275,6 +275,7 @@ class FP8SafeTensorLoader(SafeTensorLoader): self._is_per_channel = False else: self._is_per_channel = False # Will be updated in _detect_format if auto-detect + self._is_vl_model = False self._detect_format() def _detect_format(self): @@ -313,6 +314,10 @@ class FP8SafeTensorLoader(SafeTensorLoader): self._scale_suffix = "weight_scale_inv" self._is_per_channel = False print("[FP8SafeTensorLoader] Detected scale format: block-wise (weight_scale_inv)") + if key.startswith("model.language_model.") and self._detected_format == "deepseek": + # VL models(Qwen3.5): model.layers.{N} -> model.language_model.layers.{N} + self._is_vl_model = True + print("[FP8SafeTensorLoader] Detected VL model") return elif f".{gate}.weight_scale" in key and "weight_scale_inv" not in key: self._scale_suffix = "weight_scale" @@ -331,6 +336,8 @@ class FP8SafeTensorLoader(SafeTensorLoader): def _get_experts_prefix(self, base_key: str) -> str: """Get the experts prefix based on detected format.""" path_tpl, _, _, _ = 
self.MOE_FORMATS[self._detected_format] + if self._is_vl_model: + base_key = base_key.replace("model.layers", "model.language_model.layers") return path_tpl.format(base=base_key) def _get_proj_names(self): @@ -416,108 +423,6 @@ class FP8SafeTensorLoader(SafeTensorLoader): return self._is_per_channel -class BF16SafeTensorLoader(SafeTensorLoader): - """Loader for native BF16 expert weights (no quantization, no scales). - - Supported formats: - - DeepSeek style: {base}.mlp.experts.{id}.{gate,up,down}_proj.weight - - Mixtral/MiniMax style: {base}.block_sparse_moe.experts.{id}.{w1,w3,w2}.weight - - The format is auto-detected during initialization. - """ - - MOE_FORMATS = { - "deepseek": ("{base}.mlp.experts", "gate_proj", "up_proj", "down_proj"), - "mixtral": ("{base}.block_sparse_moe.experts", "w1", "w3", "w2"), - } - - def __init__(self, file_path: str): - super().__init__(file_path) - self._detected_format = None - self._detect_format() - - def _detect_format(self): - """Auto-detect the MoE naming format by checking tensor keys.""" - sample_keys = list(self.tensor_file_map.keys())[:1000] - - # Check for packed format first (Qwen3.5 MoE style: all experts in one 3D tensor) - for key in sample_keys: - if key.endswith(".mlp.experts.gate_up_proj"): - self._detected_format = "packed" - print("[BF16SafeTensorLoader] Detected format: packed (Qwen3.5 MoE style)") - return - - for fmt_name, (path_tpl, gate, up, down) in self.MOE_FORMATS.items(): - for key in sample_keys: - if ".experts." 
in key and f".{gate}.weight" in key: - if "block_sparse_moe.experts" in key and fmt_name == "mixtral": - self._detected_format = fmt_name - print(f"[BF16SafeTensorLoader] Detected format: {fmt_name}") - return - elif "mlp.experts" in key and "block_sparse_moe" not in key and fmt_name == "deepseek": - self._detected_format = fmt_name - print(f"[BF16SafeTensorLoader] Detected format: {fmt_name}") - return - - self._detected_format = "deepseek" - print("[BF16SafeTensorLoader] No MoE format detected, defaulting to: deepseek") - - def _get_experts_prefix(self, base_key: str) -> str: - """Get the experts prefix based on detected format.""" - path_tpl, _, _, _ = self.MOE_FORMATS[self._detected_format] - return path_tpl.format(base=base_key) - - def _get_proj_names(self): - """Get projection names (gate, up, down) based on detected format.""" - _, gate, up, down = self.MOE_FORMATS[self._detected_format] - return gate, up, down - - def load_tensor(self, key: str, device: str = "cpu"): - if key not in self.tensor_file_map: - raise KeyError(f"Key {key} not found in Safetensor files") - file = self.tensor_file_map[key] - f = self.file_handle_map.get(file) - if f is None: - raise FileNotFoundError(f"File {file} not found in Safetensor files") - tensor = f.get_tensor(key) - if device == "cpu": - return tensor - return tensor.to(device) - - def load_experts(self, base_key: str, device: str = "cpu"): - """Load BF16 expert weights (no scales needed).""" - if self._detected_format == "packed": - return self._load_experts_packed(base_key, device) - - experts_prefix = self._get_experts_prefix(base_key) - gate_name, up_name, down_name = self._get_proj_names() - - expert_count = 0 - while self.has_tensor(f"{experts_prefix}.{expert_count}.{gate_name}.weight"): - expert_count += 1 - - if expert_count == 0: - raise ValueError(f"No experts found for key {experts_prefix}") - - gate_weights = [None] * expert_count - up_weights = [None] * expert_count - down_weights = [None] * expert_count - - 
for exp_id in range(expert_count): - gate_w_key = f"{experts_prefix}.{exp_id}.{gate_name}.weight" - up_w_key = f"{experts_prefix}.{exp_id}.{up_name}.weight" - down_w_key = f"{experts_prefix}.{exp_id}.{down_name}.weight" - - gate_weights[exp_id] = self.load_tensor(gate_w_key, device).contiguous() - up_weights[exp_id] = self.load_tensor(up_w_key, device).contiguous() - down_weights[exp_id] = self.load_tensor(down_w_key, device).contiguous() - - return { - "gate": gate_weights, - "up": up_weights, - "down": down_weights, - } - class BF16SafeTensorLoader(SafeTensorLoader): """Loader for native BF16 expert weights (no quantization, no scales).