fix: qwen3-npu bugs; update: add readme-for-qwen3-npu (#1717)

* fix: qwen3-npu bugs; update: add readme-for-qwen3-npu

* fix: Correct the README description
This commit is contained in:
Shaoxu Cheng
2025-12-16 14:27:04 +08:00
committed by GitHub
parent 18fb8fc897
commit f25e58ad69
4 changed files with 120 additions and 224 deletions

View File

@@ -12,10 +12,6 @@ import torch.nn as nn
import transformers
from transformers import Cache, PretrainedConfig
from typing import List, Optional, Dict, Any, Tuple
# Optional dependency: the balance_serve scheduler extension. On builds
# without balance_serve we degrade gracefully instead of crashing at import.
try:
    from ktransformers.server.balance_serve.settings import sched_ext
except ImportError:
    # Fix: catch ImportError only — the original bare `except:` also
    # swallowed SystemExit/KeyboardInterrupt and unrelated runtime errors,
    # hiding real failures inside the sched_ext module.
    print("no balance_serve")
try:
import torch_npu
@@ -26,6 +22,7 @@ try:
except:
use_torch_npu = False
from transformers.models.llama.modeling_llama import LlamaDecoderLayer
from ktransformers.server.balance_serve.settings import sched_ext
class StaticCache(transformers.StaticCache):
"""
@@ -544,144 +541,6 @@ class KVC2Qwen3Cache(nn.Module):
self.k_caches = []
self.v_caches = []
# 环境变量控制日志/调试
self.debug_load = os.environ.get("KTRANS_DEBUG_KV_LOAD", "0") == "1"
self.debug_update = os.environ.get("KTRANS_DEBUG_KV_UPDATE", "0") == "1"
# ------------------------- 调试工具(按 env 开关) -------------------------
def _debug_dump_page_layout(
self,
page_idx: torch.Tensor, # [B, Q] 或 [N]
page_offset: torch.Tensor, # 同上
bsz: int,
q_len: int,
layer_idx: int,
):
# graph capture 时跳过
try:
if hasattr(torch.npu, "is_current_stream_capturing") and torch.npu.is_current_stream_capturing():
return
except Exception:
pass
page_size = self.page_size
pi = page_idx.detach().to("cpu", torch.long).reshape(bsz, q_len)
po = page_offset.detach().to("cpu", torch.long).reshape(bsz, q_len)
for b in range(bsz):
row_pi = pi[b] # [Q]
row_po = po[b] # [Q]
unique_pages = sorted(set(row_pi.tolist()))
print(f"[DEBUG-LAYOUT] layer={layer_idx}, batch={b}: pages used = {unique_pages}", flush=True)
for p in unique_pages:
mask = (row_pi == p)
if not mask.any():
continue
offsets = row_po[mask]
cnt = int(mask.sum())
min_off = int(offsets.min())
max_off = int(offsets.max())
print(
f" page {p}: count={cnt}, "
f"offset[min,max]=[{min_off}, {max_off}]",
flush=True,
)
# Q 比较小的时候,画一个 ASCII 条形图(仅看前 64 个位置)
if page_size <= 64 and q_len <= 64:
bar = ['.'] * page_size
for off in offsets.tolist():
if 0 <= off < page_size:
bar[off] = 'X'
print(f" layout: [{''.join(bar)}]", flush=True)
def _debug_check_page_mapping(
self,
page_idx: torch.Tensor,
page_offset: torch.Tensor,
bsz: int,
q_len: int,
layer_idx: int,
):
"""
看 (page_idx, page_offset) 展开后的 global_pos 是否合理。
global_pos = page_idx * page_size + page_offset
"""
page_size = self.page_size
pi = page_idx.to(torch.long).reshape(-1)
po = page_offset.to(torch.long).reshape(-1)
N = bsz * q_len
if pi.numel() != N:
print(
f"[DEBUG-PAGE][WARN] layer={layer_idx}: "
f"numel(page_idx)={pi.numel()} != bsz*q_len={N}",
flush=True,
)
return
gpos = pi * page_size + po # [N]
n_show = gpos.numel()
print(
f"[DEBUG-PAGE] layer={layer_idx}, first {n_show} global_pos="
f"{gpos[:n_show].tolist()}",
flush=True,
)
if gpos.numel() > 1:
diffs = gpos[1:] - gpos[:-1]
print(
f"[DEBUG-PAGE] layer={layer_idx}, "
f"global_pos diff min={int(diffs.min())}, max={int(diffs.max())}",
flush=True,
)
if not torch.all(diffs >= 0):
print(
f"[DEBUG-PAGE][WARN] layer={layer_idx}: "
f"global_pos not non-decreasing!",
flush=True,
)
def _debug_verify_k_roundtrip(
self,
flat_k: torch.Tensor, # [N, KvH, Dh]
layer_idx: int,
page_idx: torch.Tensor,
page_offset: torch.Tensor,
):
k_out = self.k_caches[layer_idx] # [num_pages, page_size, KvH, Dh]
pi = page_idx.to(torch.long).reshape(-1)
po = page_offset.to(torch.long).reshape(-1)
fetched = k_out[pi, po] # [N, KvH, Dh]
if fetched.shape != flat_k.shape:
print(
f"[DEBUG-KV][WARN] layer={layer_idx}: "
f"fetched.shape={tuple(fetched.shape)} != flat_k.shape={tuple(flat_k.shape)}",
flush=True,
)
return
diff = (fetched - flat_k).abs()
max_diff = diff.max().item()
mean_diff = diff.mean().item()
print(
f"[DEBUG-KV] layer={layer_idx}: K roundtrip max_abs_diff={max_diff}, "
f"mean_abs_diff={mean_diff}",
flush=True,
)
# ------------------------- 绑定到底层 kvc2 pool -------------------------
@@ -708,14 +567,6 @@ class KVC2Qwen3Cache(nn.Module):
self.k_caches.append(k_buf)
self.v_caches.append(v_buf)
if self.debug_load:
print(
f"[KV-CACHE-LOAD] layer={i}, "
f"k_cache shape={tuple(k_buf.shape)}, "
f"v_cache shape={tuple(v_buf.shape)}, dtype={k_buf.dtype}",
flush=True,
)
# num_pages * page_size
self.max_cache_len = self.k_caches[0].shape[0] * self.k_caches[0].shape[1]
@@ -740,22 +591,8 @@ class KVC2Qwen3Cache(nn.Module):
k_out = self.k_caches[layer_idx]
v_out = self.v_caches[layer_idx]
if self.debug_update:
print(
"[KV-UPDATE]",
f"layer={layer_idx}, key={tuple(key_states.shape)}, value={tuple(value_states.shape)}, "
f"page_idx shape={tuple(page_idx.shape)}, page_offset shape={tuple(page_offset.shape)}, "
f"k_out shape={tuple(k_out.shape)}, k_out.dtype={k_out.dtype}",
flush=True,
)
# -------- 1) 修正维度顺序:[B, KvH, Q, D] -> [B, Q, KvH, D] --------
if key_states.dim() == 4 and key_states.shape[1] == self.num_kv_heads:
if self.debug_update:
print(
"[KV-UPDATE] detected layout [B, KvH, Q, D], transpose -> [B, Q, KvH, D]",
flush=True,
)
key_states = key_states.transpose(1, 2).contiguous()
value_states = value_states.transpose(1, 2).contiguous()
@@ -773,13 +610,6 @@ class KVC2Qwen3Cache(nn.Module):
bsz, q_len, kv_heads, head_dim = key_states.shape
# if self.debug_update:
# print(
# "[KV-UPDATE] after layout fix:",
# f"bsz={bsz}, q_len={q_len}, kv_heads={kv_heads}, head_dim={head_dim}",
# flush=True,
# )
if kv_heads != self.num_kv_heads or head_dim != self.head_dim:
raise ValueError(
f"[KVC2Qwen3Cache] KV shape mismatch: "
@@ -787,29 +617,6 @@ class KVC2Qwen3Cache(nn.Module):
f"expected num_kv_heads={self.num_kv_heads}, head_dim={self.head_dim}"
)
# ================== DEBUG检查 page 映射 ==================
if os.environ.get("KTRANS_DEBUG_PAGE", "0") == "1":
try:
if not torch.npu.is_current_stream_capturing():
self._debug_check_page_mapping(
page_idx,
page_offset,
bsz=bsz,
q_len=q_len,
layer_idx=layer_idx,
)
except Exception:
pass
if os.environ.get("KTRANS_DEBUG_LAYOUT", "0") == "1":
self._debug_dump_page_layout(
page_idx,
page_offset,
bsz=bsz,
q_len=q_len,
layer_idx=layer_idx,
)
# -------- 2) flatten page_idx / page_offset 为一维 --------
page_idx = page_idx.reshape(-1)
page_offset = page_offset.reshape(-1)
@@ -819,40 +626,11 @@ class KVC2Qwen3Cache(nn.Module):
flat_k = key_states.to(val_dtype).reshape(-1, kv_heads, head_dim)
flat_v = value_states.to(val_dtype).reshape(-1, kv_heads, head_dim)
# if self.debug_update:
# print(
# "[KV-UPDATE] flat_k.shape=",
# tuple(flat_k.shape),
# " flat_v.shape=",
# tuple(flat_v.shape),
# " flat_k.dtype=",
# flat_k.dtype,
# flush=True,
# )
# -------- 4) 真正写入 K / V --------
# k_out / v_out: [num_pages, page_size, num_kv_heads, head_dim]
k_out[page_idx, page_offset] = flat_k
v_out[page_idx, page_offset] = flat_v
# if self.debug_update:
# print(f"[KV-UPDATE] write done for layer {layer_idx}", flush=True)
# ================== DEBUG写入后从 cache 读出来对比 ==================
if os.environ.get("KTRANS_DEBUG_KV", "0") == "1":
try:
if not torch.npu.is_current_stream_capturing():
self._debug_verify_k_roundtrip(
flat_k=flat_k,
layer_idx=layer_idx,
page_idx=page_idx,
page_offset=page_offset,
)
except Exception:
pass
return k_out, v_out
# ------------------------- get K/V -------------------------
def get_k_cache(self, layer_idx):
return self.k_caches[layer_idx]

View File

@@ -667,7 +667,7 @@ def translate_name_to_gguf(name):
name = translate_name_to_gguf_mixtral(name)
if ".ffn_gate_exp." in name:
name = name.replace(".ffn_gate_exp.", ".ffn_gate_exps.")
name = name.replace(".ffn_gate_exp.", ".ffn_gate_exps.")
if ".ffn_up_exp." in name:
name = name.replace(".ffn_up_exp.", ".ffn_up_exps.")
if ".ffn_down_exp." in name:

View File

@@ -0,0 +1,118 @@
# 基准测试结果(输出token长度均设置1k, 单并发)
| Prompt length | 1K | 2K | 4K |
| --------------------------------- | ------ | ------ | ------ |
| KTrans Prefill token/s | 134.11 | 141.60 | 143.42 |
| KTrans Decode token/s | 11.05 | 10.74 | 10.68 |
## 先决条件
我们在以下配置下进行了Qwen3-235B-A22B MoE最佳性能测试
- 服务器型号Atlas 2UP
- NPUAtlas 300I A2
- CPU: HUAWEI Kunpeng 920 7270Z
- 内存: DDR5服务器内存1TB
# 部署
***关于部署过程此README中只额外描述与同级目录下 `DeepseekR1_V3_tutorial_zh_for_Ascend_NPU.md` 不同的部分***
## 物理机安装
部署满血版Qwen3-MoE需要机器物理内存能够存放下全部路由专家的权重约200GB。
目前支持的NPU型号**300I A2**。
在技术人员的支持下完成硬件安装。
## 权重准备
目前,为了满足性能和精度的要求,我们需要准备两份权重,并使用提供的权重合并脚本对权重进行合并,最终只会使用合并后的权重。
Q4权重[Qwen3-235B-A22B-Instruct-2507-GGUF](https://modelscope.cn/models/unsloth/Qwen3-235B-A22B-Instruct-2507-GGUF/files)
W8A8权重[Qwen3-235B-A22B-w8a8](https://modelers.cn/models/Modelers_Park/Qwen3-235B-A22B-w8a8)
使用[merge_safetensor_gguf_for_qwen3.py](../../merge_tensors/merge_safetensor_gguf_for_qwen3.py)来合并Q4和W8A8权重
```bash
python merge_safetensor_gguf_for_qwen3.py --safetensor_path /mnt/weights/Qwen3-235B-A22B-Q4_K_M --gguf_path /mnt/weights/Qwen3-235B-A22B-W8A8 --output_path /mnt/weights/Qwen3-235B-A22B-q4km-w8a8
```
## kTransformers部署
将项目文件部署到机器上:
- 初始化third_party。由于此过程耗时较多且容易受网络影响导致仓库克隆失败建议初始化一次后将相关文件进行打包以便后续直接解压使用。
```bash
git clone https://github.com/kvcache-ai/ktransformers.git
cd ktransformers
git submodule update --init --recursive
```
- 对于arm平台注释掉`./third_party/llamafile/iqk_mul_mat_arm82.cpp`中的
```cpp
#define iqk_mul_mat iqk_mul_mat_arm82
#define iqk_mul_mat_moe iqk_mul_mat_moe_arm82
```
- 执行`source /usr/local/Ascend/ascend-toolkit/set_env.sh`以实际CANN-TOOLKIT安装路径为准
- 执行`apt install cmake libhwloc-dev pkg-config`安装依赖。
- 修改项目目录下 `./ktransformers/configs/config.yaml` 中 attn 部分的配置为 page_size: 128、chunk_size: 16384。
- 执行`USE_BALANCE_SERVE=1 USE_NUMA=1 bash ./install.sh`,等待安装完成。
***执行安装命令之前,需要将`./ktransformers/configs/config.yaml`中对于page size的设置改为page size=128(因为attn计算算子`torch_npu.npu_fused_infer_attention_score`支持page_size=16/128)***
此处给出示例balance_serve的启动脚本由于使用了相对路径需将该脚本放至项目的根路径下
```bash
#!/bin/bash
export USE_MERGE=0
export INF_NAN_MODE_FORCE_DISABLE=1
export TASK_QUEUE_ENABLE=0
export RANK=0
export LOCAL_WORLD_SIZE=1
#export PROF_DECODE=1
#export PROF_PREFILL=1
source /usr/local/Ascend/ascend-toolkit/set_env.sh
source /usr/local/Ascend/nnal/atb/set_env.sh
python ktransformers/server/main.py \
--port 10002 \
--model_path <your model path> \
--gguf_path <your model path> \
--cpu_infer 48 \
--optimize_config_path ./ktransformers/optimize/optimize_rules/npu/Qwen3-Chat-300IA2-npu-serve.yaml \
--max_new_tokens 1024 \
--cache_lens 16384 \
--max_batch_size 4 \
--use_cuda_graph \
--tp 1 \
--backend_type balance_serve
```
相关参数说明:
- `--model_path`kTransformers原生参数str此处用来指定合并后的模型文件路径
- `--gguf_path`kTransformers原生参数str此处用来指定合并后的模型文件路径
- `--cpu_infer`kTransformers原生参数int用来控制CPU侧实际worker线程数非必选
- `--optimize_config_path`kTransformers原生参数str用来指定所用的模型优化配置文件需要注意相对路径的使用此处为**必选**
- `--cache_lens`:调度器申请 kvcache 的总长度。所有请求共享指定数量(例如上面启动脚本中的 `16384`)的 tokens 对应的 kvcache 空间,请求完成后会释放其所占用的 kvcache 空间,非必选
- `--use_cuda_graph`kTransformers原生参数bool为True表示开启图下沉为False表示关闭图下沉非必选
- `--max_new_tokens`kTransformers原生参数int当统计到输出的tokens数量达到该值时会直接中止输出非必选
- `--tp`新增参数int用于开启tensor model parallel功能目前local_chat只支持tp大小与ws大小相同不支持local_chat使用多dp非必选
# 其他问题
## 可能存在的其他依赖问题
ImportError: libhccl.so: cannot open shared object file: No such file or directory
```bash
source /usr/local/Ascend/ascend-toolkit/set_env.sh # 以实际CANN安装路径为准
```
ImportError: libascend_hal.so: cannot open shared object file: No such file or directory
```bash
export LD_LIBRARY_PATH=/usr/local/Ascend/driver/lib64/driver:$LD_LIBRARY_PATH # 以实际Driver安装路径为准
```