mirror of
https://github.com/kvcache-ai/ktransformers.git
synced 2026-03-14 18:37:23 +00:00
fix: qwen3-npu bugs; update: add readme-for-qwen3-npu (#1717)
* fix: qwen3-npu bugs; update: add readme-for-qwen3-npu * fix: Correct the README description
This commit is contained in:
@@ -12,10 +12,6 @@ import torch.nn as nn
|
||||
import transformers
|
||||
from transformers import Cache, PretrainedConfig
|
||||
from typing import List, Optional, Dict, Any, Tuple
|
||||
try:
|
||||
from ktransformers.server.balance_serve.settings import sched_ext
|
||||
except:
|
||||
print("no balance_serve")
|
||||
|
||||
try:
|
||||
import torch_npu
|
||||
@@ -26,6 +22,7 @@ try:
|
||||
except:
|
||||
use_torch_npu = False
|
||||
from transformers.models.llama.modeling_llama import LlamaDecoderLayer
|
||||
from ktransformers.server.balance_serve.settings import sched_ext
|
||||
|
||||
class StaticCache(transformers.StaticCache):
|
||||
"""
|
||||
@@ -544,144 +541,6 @@ class KVC2Qwen3Cache(nn.Module):
|
||||
self.k_caches = []
|
||||
self.v_caches = []
|
||||
|
||||
# 环境变量控制日志/调试
|
||||
self.debug_load = os.environ.get("KTRANS_DEBUG_KV_LOAD", "0") == "1"
|
||||
self.debug_update = os.environ.get("KTRANS_DEBUG_KV_UPDATE", "0") == "1"
|
||||
|
||||
# ------------------------- 调试工具(按 env 开关) -------------------------
|
||||
|
||||
def _debug_dump_page_layout(
|
||||
self,
|
||||
page_idx: torch.Tensor, # [B, Q] 或 [N]
|
||||
page_offset: torch.Tensor, # 同上
|
||||
bsz: int,
|
||||
q_len: int,
|
||||
layer_idx: int,
|
||||
):
|
||||
# graph capture 时跳过
|
||||
try:
|
||||
if hasattr(torch.npu, "is_current_stream_capturing") and torch.npu.is_current_stream_capturing():
|
||||
return
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
page_size = self.page_size
|
||||
|
||||
pi = page_idx.detach().to("cpu", torch.long).reshape(bsz, q_len)
|
||||
po = page_offset.detach().to("cpu", torch.long).reshape(bsz, q_len)
|
||||
|
||||
for b in range(bsz):
|
||||
row_pi = pi[b] # [Q]
|
||||
row_po = po[b] # [Q]
|
||||
|
||||
unique_pages = sorted(set(row_pi.tolist()))
|
||||
print(f"[DEBUG-LAYOUT] layer={layer_idx}, batch={b}: pages used = {unique_pages}", flush=True)
|
||||
|
||||
for p in unique_pages:
|
||||
mask = (row_pi == p)
|
||||
if not mask.any():
|
||||
continue
|
||||
offsets = row_po[mask]
|
||||
|
||||
cnt = int(mask.sum())
|
||||
min_off = int(offsets.min())
|
||||
max_off = int(offsets.max())
|
||||
|
||||
print(
|
||||
f" page {p}: count={cnt}, "
|
||||
f"offset[min,max]=[{min_off}, {max_off}]",
|
||||
flush=True,
|
||||
)
|
||||
|
||||
# Q 比较小的时候,画一个 ASCII 条形图(仅看前 64 个位置)
|
||||
if page_size <= 64 and q_len <= 64:
|
||||
bar = ['.'] * page_size
|
||||
for off in offsets.tolist():
|
||||
if 0 <= off < page_size:
|
||||
bar[off] = 'X'
|
||||
print(f" layout: [{''.join(bar)}]", flush=True)
|
||||
|
||||
def _debug_check_page_mapping(
|
||||
self,
|
||||
page_idx: torch.Tensor,
|
||||
page_offset: torch.Tensor,
|
||||
bsz: int,
|
||||
q_len: int,
|
||||
layer_idx: int,
|
||||
):
|
||||
"""
|
||||
看 (page_idx, page_offset) 展开后的 global_pos 是否合理。
|
||||
global_pos = page_idx * page_size + page_offset
|
||||
"""
|
||||
page_size = self.page_size
|
||||
|
||||
pi = page_idx.to(torch.long).reshape(-1)
|
||||
po = page_offset.to(torch.long).reshape(-1)
|
||||
N = bsz * q_len
|
||||
|
||||
if pi.numel() != N:
|
||||
print(
|
||||
f"[DEBUG-PAGE][WARN] layer={layer_idx}: "
|
||||
f"numel(page_idx)={pi.numel()} != bsz*q_len={N}",
|
||||
flush=True,
|
||||
)
|
||||
return
|
||||
|
||||
gpos = pi * page_size + po # [N]
|
||||
|
||||
n_show = gpos.numel()
|
||||
print(
|
||||
f"[DEBUG-PAGE] layer={layer_idx}, first {n_show} global_pos="
|
||||
f"{gpos[:n_show].tolist()}",
|
||||
flush=True,
|
||||
)
|
||||
|
||||
if gpos.numel() > 1:
|
||||
diffs = gpos[1:] - gpos[:-1]
|
||||
print(
|
||||
f"[DEBUG-PAGE] layer={layer_idx}, "
|
||||
f"global_pos diff min={int(diffs.min())}, max={int(diffs.max())}",
|
||||
flush=True,
|
||||
)
|
||||
|
||||
if not torch.all(diffs >= 0):
|
||||
print(
|
||||
f"[DEBUG-PAGE][WARN] layer={layer_idx}: "
|
||||
f"global_pos not non-decreasing!",
|
||||
flush=True,
|
||||
)
|
||||
|
||||
def _debug_verify_k_roundtrip(
|
||||
self,
|
||||
flat_k: torch.Tensor, # [N, KvH, Dh]
|
||||
layer_idx: int,
|
||||
page_idx: torch.Tensor,
|
||||
page_offset: torch.Tensor,
|
||||
):
|
||||
k_out = self.k_caches[layer_idx] # [num_pages, page_size, KvH, Dh]
|
||||
|
||||
pi = page_idx.to(torch.long).reshape(-1)
|
||||
po = page_offset.to(torch.long).reshape(-1)
|
||||
|
||||
fetched = k_out[pi, po] # [N, KvH, Dh]
|
||||
|
||||
if fetched.shape != flat_k.shape:
|
||||
print(
|
||||
f"[DEBUG-KV][WARN] layer={layer_idx}: "
|
||||
f"fetched.shape={tuple(fetched.shape)} != flat_k.shape={tuple(flat_k.shape)}",
|
||||
flush=True,
|
||||
)
|
||||
return
|
||||
|
||||
diff = (fetched - flat_k).abs()
|
||||
max_diff = diff.max().item()
|
||||
mean_diff = diff.mean().item()
|
||||
|
||||
print(
|
||||
f"[DEBUG-KV] layer={layer_idx}: K roundtrip max_abs_diff={max_diff}, "
|
||||
f"mean_abs_diff={mean_diff}",
|
||||
flush=True,
|
||||
)
|
||||
|
||||
# ------------------------- 绑定到底层 kvc2 pool -------------------------
|
||||
|
||||
@@ -708,14 +567,6 @@ class KVC2Qwen3Cache(nn.Module):
|
||||
self.k_caches.append(k_buf)
|
||||
self.v_caches.append(v_buf)
|
||||
|
||||
if self.debug_load:
|
||||
print(
|
||||
f"[KV-CACHE-LOAD] layer={i}, "
|
||||
f"k_cache shape={tuple(k_buf.shape)}, "
|
||||
f"v_cache shape={tuple(v_buf.shape)}, dtype={k_buf.dtype}",
|
||||
flush=True,
|
||||
)
|
||||
|
||||
# num_pages * page_size
|
||||
self.max_cache_len = self.k_caches[0].shape[0] * self.k_caches[0].shape[1]
|
||||
|
||||
@@ -740,22 +591,8 @@ class KVC2Qwen3Cache(nn.Module):
|
||||
k_out = self.k_caches[layer_idx]
|
||||
v_out = self.v_caches[layer_idx]
|
||||
|
||||
if self.debug_update:
|
||||
print(
|
||||
"[KV-UPDATE]",
|
||||
f"layer={layer_idx}, key={tuple(key_states.shape)}, value={tuple(value_states.shape)}, "
|
||||
f"page_idx shape={tuple(page_idx.shape)}, page_offset shape={tuple(page_offset.shape)}, "
|
||||
f"k_out shape={tuple(k_out.shape)}, k_out.dtype={k_out.dtype}",
|
||||
flush=True,
|
||||
)
|
||||
|
||||
# -------- 1) 修正维度顺序:[B, KvH, Q, D] -> [B, Q, KvH, D] --------
|
||||
if key_states.dim() == 4 and key_states.shape[1] == self.num_kv_heads:
|
||||
if self.debug_update:
|
||||
print(
|
||||
"[KV-UPDATE] detected layout [B, KvH, Q, D], transpose -> [B, Q, KvH, D]",
|
||||
flush=True,
|
||||
)
|
||||
key_states = key_states.transpose(1, 2).contiguous()
|
||||
value_states = value_states.transpose(1, 2).contiguous()
|
||||
|
||||
@@ -773,13 +610,6 @@ class KVC2Qwen3Cache(nn.Module):
|
||||
|
||||
bsz, q_len, kv_heads, head_dim = key_states.shape
|
||||
|
||||
# if self.debug_update:
|
||||
# print(
|
||||
# "[KV-UPDATE] after layout fix:",
|
||||
# f"bsz={bsz}, q_len={q_len}, kv_heads={kv_heads}, head_dim={head_dim}",
|
||||
# flush=True,
|
||||
# )
|
||||
|
||||
if kv_heads != self.num_kv_heads or head_dim != self.head_dim:
|
||||
raise ValueError(
|
||||
f"[KVC2Qwen3Cache] KV shape mismatch: "
|
||||
@@ -787,29 +617,6 @@ class KVC2Qwen3Cache(nn.Module):
|
||||
f"expected num_kv_heads={self.num_kv_heads}, head_dim={self.head_dim}"
|
||||
)
|
||||
|
||||
# ================== DEBUG:检查 page 映射 ==================
|
||||
if os.environ.get("KTRANS_DEBUG_PAGE", "0") == "1":
|
||||
try:
|
||||
if not torch.npu.is_current_stream_capturing():
|
||||
self._debug_check_page_mapping(
|
||||
page_idx,
|
||||
page_offset,
|
||||
bsz=bsz,
|
||||
q_len=q_len,
|
||||
layer_idx=layer_idx,
|
||||
)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
if os.environ.get("KTRANS_DEBUG_LAYOUT", "0") == "1":
|
||||
self._debug_dump_page_layout(
|
||||
page_idx,
|
||||
page_offset,
|
||||
bsz=bsz,
|
||||
q_len=q_len,
|
||||
layer_idx=layer_idx,
|
||||
)
|
||||
|
||||
# -------- 2) flatten page_idx / page_offset 为一维 --------
|
||||
page_idx = page_idx.reshape(-1)
|
||||
page_offset = page_offset.reshape(-1)
|
||||
@@ -819,40 +626,11 @@ class KVC2Qwen3Cache(nn.Module):
|
||||
flat_k = key_states.to(val_dtype).reshape(-1, kv_heads, head_dim)
|
||||
flat_v = value_states.to(val_dtype).reshape(-1, kv_heads, head_dim)
|
||||
|
||||
# if self.debug_update:
|
||||
# print(
|
||||
# "[KV-UPDATE] flat_k.shape=",
|
||||
# tuple(flat_k.shape),
|
||||
# " flat_v.shape=",
|
||||
# tuple(flat_v.shape),
|
||||
# " flat_k.dtype=",
|
||||
# flat_k.dtype,
|
||||
# flush=True,
|
||||
# )
|
||||
|
||||
# -------- 4) 真正写入 K / V --------
|
||||
# k_out / v_out: [num_pages, page_size, num_kv_heads, head_dim]
|
||||
k_out[page_idx, page_offset] = flat_k
|
||||
v_out[page_idx, page_offset] = flat_v
|
||||
|
||||
# if self.debug_update:
|
||||
# print(f"[KV-UPDATE] write done for layer {layer_idx}", flush=True)
|
||||
|
||||
# ================== DEBUG:写入后从 cache 读出来对比 ==================
|
||||
if os.environ.get("KTRANS_DEBUG_KV", "0") == "1":
|
||||
try:
|
||||
if not torch.npu.is_current_stream_capturing():
|
||||
self._debug_verify_k_roundtrip(
|
||||
flat_k=flat_k,
|
||||
layer_idx=layer_idx,
|
||||
page_idx=page_idx,
|
||||
page_offset=page_offset,
|
||||
)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
return k_out, v_out
|
||||
|
||||
# ------------------------- get K/V -------------------------
|
||||
def get_k_cache(self, layer_idx):
|
||||
return self.k_caches[layer_idx]
|
||||
|
||||
@@ -667,7 +667,7 @@ def translate_name_to_gguf(name):
|
||||
name = translate_name_to_gguf_mixtral(name)
|
||||
|
||||
if ".ffn_gate_exp." in name:
|
||||
name = name.replace(".ffn_gate_exp.", ".ffn_gate_exps.")
|
||||
name = name.replace(".ffn_gate_exp.", ".ffn_gate_exps.")
|
||||
if ".ffn_up_exp." in name:
|
||||
name = name.replace(".ffn_up_exp.", ".ffn_up_exps.")
|
||||
if ".ffn_down_exp." in name:
|
||||
|
||||
118
doc/zh/Qwen3-MoE_tutorial_zh_for_Ascend_NPU.md
Normal file
118
doc/zh/Qwen3-MoE_tutorial_zh_for_Ascend_NPU.md
Normal file
@@ -0,0 +1,118 @@
|
||||
# 基准测试结果(输出token长度均设置1k, 单并发)
|
||||
|
||||
| Prompt length | 1K | 2K | 4K |
|
||||
| --------------------------------- | ------ | ------ | ------ |
|
||||
| KTrans Prefill token/s | 134.11 | 141.60 | 143.42 |
|
||||
| KTrans Decode token/s | 11.05 | 10.74 | 10.68 |
|
||||
|
||||
## 先决条件
|
||||
我们在以下配置下进行了Qwen3-235B-A22B MoE最佳性能测试:
|
||||
- 服务器型号:Atlas 2UP
|
||||
- NPU:Atlas 300I A2
|
||||
- CPU: HUAWEI Kunpeng 920 7270Z
|
||||
- 内存: DDR5服务器内存(1TB)
|
||||
|
||||
# 部署
|
||||
|
||||
***关于部署过程,此README中只额外描述与同级目录下 `DeepseekR1_V3_tutorial_zh_for_Ascend_NPU.md` 不同的部分***
|
||||
|
||||
## 物理机安装
|
||||
|
||||
部署满血版Qwen3-MoE,需要机器物理内存能够存放下全部路由专家的权重,约200GB。
|
||||
|
||||
目前支持的NPU型号:**300I A2**。
|
||||
|
||||
在技术人员的支持下完成硬件安装。
|
||||
|
||||
|
||||
## 权重准备
|
||||
|
||||
目前,为了满足性能和精度的要求,我们需要准备两份权重,并使用提供的权重合并脚本对权重进行合并,最终只会使用合并后的权重。
|
||||
|
||||
Q4权重:[Qwen3-235B-A22B-Instruct-2507-GGUF](https://modelscope.cn/models/unsloth/Qwen3-235B-A22B-Instruct-2507-GGUF/files)
|
||||
|
||||
W8A8权重:[Qwen3-235B-A22B-w8a8](https://modelers.cn/models/Modelers_Park/Qwen3-235B-A22B-w8a8)
|
||||
|
||||
使用[merge_safetensor_gguf_for_qwen3.py](../../merge_tensors/merge_safetensor_gguf_for_qwen3.py)来合并Q4和W8A8权重:
|
||||
|
||||
```bash
|
||||
python merge_safetensor_gguf_for_qwen3.py --safetensor_path /mnt/weights/Qwen3-235B-A22B-Q4_K_M --gguf_path /mnt/weights/Qwen3-235B-A22B-W8A8 --output_path /mnt/weights/Qwen3-235B-A22B-q4km-w8a8
|
||||
```
|
||||
|
||||
## kTransformers部署
|
||||
|
||||
将项目文件部署到机器上:
|
||||
|
||||
- 初始化third_party。由于此过程耗时较多,且容易受网络影响导致仓库克隆失败,建议初始化一次后,将相关文件进行打包,以便后续直接解压使用。
|
||||
```bash
|
||||
git clone https://github.com/kvcache-ai/ktransformers.git
|
||||
cd ktransformers
|
||||
git submodule update --init --recursive
|
||||
```
|
||||
- 对于arm平台,注释掉`./third_party/llamafile/iqk_mul_mat_arm82.cpp`中的
|
||||
```cpp
|
||||
#define iqk_mul_mat iqk_mul_mat_arm82
|
||||
#define iqk_mul_mat_moe iqk_mul_mat_moe_arm82
|
||||
```
|
||||
- 执行`source /usr/local/Ascend/ascend-toolkit/set_env.sh`(以实际CANN-TOOLKIT安装路径为准)。
|
||||
- 执行`apt install cmake libhwloc-dev pkg-config`安装依赖。
|
||||
- 修改项目目录下 `./ktransformers/configs/config.yaml` 中 attn 部分的配置为 `page_size: 128`、`chunk_size: 16384`
|
||||
- 执行`USE_BALANCE_SERVE=1 USE_NUMA=1 bash ./install.sh`,等待安装完成。
|
||||
***执行安装命令之前,需要将`./ktransformers/configs/config.yaml`中对于page size的设置改为page size=128(因为attn计算算子`torch_npu.npu_fused_infer_attention_score`支持page_size=16/128)***
|
||||
|
||||
此处给出示例balance_serve的启动脚本(由于使用了相对路径,需将该脚本放至项目的根路径下):
|
||||
|
||||
```bash
|
||||
#!/bin/bash
|
||||
export USE_MERGE=0
|
||||
export INF_NAN_MODE_FORCE_DISABLE=1
|
||||
export TASK_QUEUE_ENABLE=0
|
||||
export RANK=0
|
||||
export LOCAL_WORLD_SIZE=1
|
||||
#export PROF_DECODE=1
|
||||
#export PROF_PREFILL=1
|
||||
|
||||
source /usr/local/Ascend/ascend-toolkit/set_env.sh
|
||||
source /usr/local/Ascend/nnal/atb/set_env.sh
|
||||
|
||||
python ktransformers/server/main.py \
|
||||
--port 10002 \
|
||||
--model_path <your model path> \
|
||||
--gguf_path <your model path> \
|
||||
--cpu_infer 48 \
|
||||
--optimize_config_path ./ktransformers/optimize/optimize_rules/npu/Qwen3-Chat-300IA2-npu-serve.yaml \
|
||||
--max_new_tokens 1024 \
|
||||
--cache_lens 16384 \
|
||||
--max_batch_size 4 \
|
||||
--use_cuda_graph \
|
||||
--tp 1 \
|
||||
--backend_type balance_serve
|
||||
```
|
||||
|
||||
相关参数说明:
|
||||
|
||||
- `--model_path`:kTransformers原生参数,str,此处用来指定合并后的模型文件路径
|
||||
- `--gguf_path`:kTransformers原生参数,str,此处用来指定合并后的模型文件路径
|
||||
- `--cpu_infer`:kTransformers原生参数,int,用来控制CPU侧实际worker线程数,非必选
|
||||
- `--optimize_config_path`:kTransformers原生参数,str,用来指定所用的模型优化配置文件,需要注意相对路径的使用,此处为**必选**
|
||||
- `--cache_lens`:调度器申请 kvcache 的总长度。所有请求共享指定数量(例如示例脚本中的 `16384`)的 tokens 对应的 kvcache 空间,请求完成后会释放其所占用的 kvcache 空间,非必选
|
||||
- `--use_cuda_graph`:kTransformers原生参数,bool,为True表示开启图下沉,为False表示关闭图下沉,非必选
|
||||
- `--max_new_tokens`:kTransformers原生参数,int,当统计到输出的tokens数量达到该值时,会直接中止输出,非必选
|
||||
- `--tp`:新增参数,int,用于开启tensor model parallel功能,目前local_chat只支持tp大小与ws大小相同(不支持local_chat使用多dp),非必选
|
||||
|
||||
|
||||
# 其他问题
|
||||
|
||||
## 可能存在的其他依赖问题
|
||||
|
||||
ImportError: libhccl.so: cannot open shared object file: No such file or directory
|
||||
|
||||
```bash
|
||||
source /usr/local/Ascend/ascend-toolkit/set_env.sh # 以实际CANN安装路径为准
|
||||
```
|
||||
|
||||
ImportError: libascend_hal.so: cannot open shared object file: No such file or directory
|
||||
|
||||
```bash
|
||||
export LD_LIBRARY_PATH=/usr/local/Ascend/driver/lib64/driver:$LD_LIBRARY_PATH # 以实际Driver安装路径为准
|
||||
```
|
||||
Reference in New Issue
Block a user