diff --git a/archive/ktransformers/models/ascend/custom_ascend_modelling_qwen3.py b/archive/ktransformers/models/ascend/custom_ascend_modeling_qwen3.py
similarity index 100%
rename from archive/ktransformers/models/ascend/custom_ascend_modelling_qwen3.py
rename to archive/ktransformers/models/ascend/custom_ascend_modeling_qwen3.py
diff --git a/archive/ktransformers/models/custom_cache.py b/archive/ktransformers/models/custom_cache.py
index 2f5c529..4a16f36 100644
--- a/archive/ktransformers/models/custom_cache.py
+++ b/archive/ktransformers/models/custom_cache.py
@@ -12,10 +12,6 @@ import torch.nn as nn
 import transformers
 from transformers import Cache, PretrainedConfig
 from typing import List, Optional, Dict, Any, Tuple
-try:
-    from ktransformers.server.balance_serve.settings import sched_ext
-except:
-    print("no balance_serve")
 
 try:
     import torch_npu
@@ -26,6 +22,7 @@ try:
 except:
     use_torch_npu = False
 from transformers.models.llama.modeling_llama import LlamaDecoderLayer
+from ktransformers.server.balance_serve.settings import sched_ext
 
 class StaticCache(transformers.StaticCache):
     """
@@ -544,144 +541,6 @@ class KVC2Qwen3Cache(nn.Module):
         self.k_caches = []
         self.v_caches = []
 
-        # Environment variables controlling logging / debugging
-        self.debug_load = os.environ.get("KTRANS_DEBUG_KV_LOAD", "0") == "1"
-        self.debug_update = os.environ.get("KTRANS_DEBUG_KV_UPDATE", "0") == "1"
-
-    # ------------------------- debug helpers (toggled via env vars) -------------------------
-
-    def _debug_dump_page_layout(
-        self,
-        page_idx: torch.Tensor,  # [B, Q] or [N]
-        page_offset: torch.Tensor,  # same as above
-        bsz: int,
-        q_len: int,
-        layer_idx: int,
-    ):
-        # skip during graph capture
-        try:
-            if hasattr(torch.npu, "is_current_stream_capturing") and torch.npu.is_current_stream_capturing():
-                return
-        except Exception:
-            pass
-
-        page_size = self.page_size
-
-        pi = page_idx.detach().to("cpu", torch.long).reshape(bsz, q_len)
-        po = page_offset.detach().to("cpu", torch.long).reshape(bsz, q_len)
-
-        for b in range(bsz):
-            row_pi = pi[b]  # [Q]
-            row_po = po[b]  # [Q]
-
-            unique_pages = sorted(set(row_pi.tolist()))
-            print(f"[DEBUG-LAYOUT] layer={layer_idx}, batch={b}: pages used = {unique_pages}", flush=True)
-
-            for p in unique_pages:
-                mask = (row_pi == p)
-                if not mask.any():
-                    continue
-                offsets = row_po[mask]
-
-                cnt = int(mask.sum())
-                min_off = int(offsets.min())
-                max_off = int(offsets.max())
-
-                print(
-                    f"    page {p}: count={cnt}, "
-                    f"offset[min,max]=[{min_off}, {max_off}]",
-                    flush=True,
-                )
-
-                # When Q is small, draw an ASCII bar chart (only the first 64 positions)
-                if page_size <= 64 and q_len <= 64:
-                    bar = ['.'] * page_size
-                    for off in offsets.tolist():
-                        if 0 <= off < page_size:
-                            bar[off] = 'X'
-                    print(f"    layout: [{''.join(bar)}]", flush=True)
-
-    def _debug_check_page_mapping(
-        self,
-        page_idx: torch.Tensor,
-        page_offset: torch.Tensor,
-        bsz: int,
-        q_len: int,
-        layer_idx: int,
-    ):
-        """
-        Check whether the global_pos expanded from (page_idx, page_offset) is sane.
-        global_pos = page_idx * page_size + page_offset
-        """
-        page_size = self.page_size
-
-        pi = page_idx.to(torch.long).reshape(-1)
-        po = page_offset.to(torch.long).reshape(-1)
-        N = bsz * q_len
-
-        if pi.numel() != N:
-            print(
-                f"[DEBUG-PAGE][WARN] layer={layer_idx}: "
-                f"numel(page_idx)={pi.numel()} != bsz*q_len={N}",
-                flush=True,
-            )
-            return
-
-        gpos = pi * page_size + po  # [N]
-
-        n_show = gpos.numel()
-        print(
-            f"[DEBUG-PAGE] layer={layer_idx}, first {n_show} global_pos="
-            f"{gpos[:n_show].tolist()}",
-            flush=True,
-        )
-
-        if gpos.numel() > 1:
-            diffs = gpos[1:] - gpos[:-1]
-            print(
-                f"[DEBUG-PAGE] layer={layer_idx}, "
-                f"global_pos diff min={int(diffs.min())}, max={int(diffs.max())}",
-                flush=True,
-            )
-
-            if not torch.all(diffs >= 0):
-                print(
-                    f"[DEBUG-PAGE][WARN] layer={layer_idx}: "
-                    f"global_pos not non-decreasing!",
-                    flush=True,
-                )
-
-    def _debug_verify_k_roundtrip(
-        self,
-        flat_k: torch.Tensor,  # [N, KvH, Dh]
-        layer_idx: int,
-        page_idx: torch.Tensor,
-        page_offset: torch.Tensor,
-    ):
-        k_out = self.k_caches[layer_idx]  # [num_pages, page_size, KvH, Dh]
-
-        pi = page_idx.to(torch.long).reshape(-1)
-        po = page_offset.to(torch.long).reshape(-1)
-
-        fetched = k_out[pi, po]  # [N, KvH, Dh]
-
-        if fetched.shape != flat_k.shape:
-            print(
-                f"[DEBUG-KV][WARN] layer={layer_idx}: "
-                f"fetched.shape={tuple(fetched.shape)} != flat_k.shape={tuple(flat_k.shape)}",
-                flush=True,
-            )
-            return
-
-        diff = (fetched - flat_k).abs()
-        max_diff = diff.max().item()
-        mean_diff = diff.mean().item()
-
-        print(
-            f"[DEBUG-KV] layer={layer_idx}: K roundtrip max_abs_diff={max_diff}, "
-            f"mean_abs_diff={mean_diff}",
-            flush=True,
-        )
 
     # ------------------------- bind to the underlying kvc2 pool -------------------------
 
@@ -708,14 +567,6 @@ class KVC2Qwen3Cache(nn.Module):
             self.k_caches.append(k_buf)
             self.v_caches.append(v_buf)
 
-            if self.debug_load:
-                print(
-                    f"[KV-CACHE-LOAD] layer={i}, "
-                    f"k_cache shape={tuple(k_buf.shape)}, "
-                    f"v_cache shape={tuple(v_buf.shape)}, dtype={k_buf.dtype}",
-                    flush=True,
-                )
-
         # num_pages * page_size
         self.max_cache_len = self.k_caches[0].shape[0] * self.k_caches[0].shape[1]
 
@@ -740,22 +591,8 @@ class KVC2Qwen3Cache(nn.Module):
         k_out = self.k_caches[layer_idx]
         v_out = self.v_caches[layer_idx]
 
-        if self.debug_update:
-            print(
-                "[KV-UPDATE]",
-                f"layer={layer_idx}, key={tuple(key_states.shape)}, value={tuple(value_states.shape)}, "
-                f"page_idx shape={tuple(page_idx.shape)}, page_offset shape={tuple(page_offset.shape)}, "
-                f"k_out shape={tuple(k_out.shape)}, k_out.dtype={k_out.dtype}",
-                flush=True,
-            )
-
         # -------- 1) fix the dimension order: [B, KvH, Q, D] -> [B, Q, KvH, D] --------
         if key_states.dim() == 4 and key_states.shape[1] == self.num_kv_heads:
-            if self.debug_update:
-                print(
-                    "[KV-UPDATE] detected layout [B, KvH, Q, D], transpose -> [B, Q, KvH, D]",
-                    flush=True,
-                )
             key_states = key_states.transpose(1, 2).contiguous()
             value_states = value_states.transpose(1, 2).contiguous()
 
@@ -773,13 +610,6 @@ class KVC2Qwen3Cache(nn.Module):
 
         bsz, q_len, kv_heads, head_dim = key_states.shape
 
-        # if self.debug_update:
-        #     print(
-        #         "[KV-UPDATE] after layout fix:",
-        #         f"bsz={bsz}, q_len={q_len}, kv_heads={kv_heads}, head_dim={head_dim}",
-        #         flush=True,
-        #     )
-
         if kv_heads != self.num_kv_heads or head_dim != self.head_dim:
             raise ValueError(
                 f"[KVC2Qwen3Cache] KV shape mismatch: "
@@ -787,29 +617,6 @@ class KVC2Qwen3Cache(nn.Module):
                 f"expected num_kv_heads={self.num_kv_heads}, head_dim={self.head_dim}"
            )
 
-        # ================== DEBUG: check the page mapping ==================
-        if os.environ.get("KTRANS_DEBUG_PAGE", "0") == "1":
-            try:
-                if not torch.npu.is_current_stream_capturing():
-                    self._debug_check_page_mapping(
-                        page_idx,
-                        page_offset,
-                        bsz=bsz,
-                        q_len=q_len,
-                        layer_idx=layer_idx,
-                    )
-            except Exception:
-                pass
-
-        if os.environ.get("KTRANS_DEBUG_LAYOUT", "0") == "1":
-            self._debug_dump_page_layout(
-                page_idx,
-                page_offset,
-                bsz=bsz,
-                q_len=q_len,
-                layer_idx=layer_idx,
-            )
-
         # -------- 2) flatten page_idx / page_offset to 1-D --------
         page_idx = page_idx.reshape(-1)
         page_offset = page_offset.reshape(-1)
@@ -819,40 +626,11 @@ class KVC2Qwen3Cache(nn.Module):
         flat_k = key_states.to(val_dtype).reshape(-1, kv_heads, head_dim)
         flat_v = value_states.to(val_dtype).reshape(-1, kv_heads, head_dim)
 
-        # if self.debug_update:
-        #     print(
-        #         "[KV-UPDATE] flat_k.shape=",
-        #         tuple(flat_k.shape),
-        #         " flat_v.shape=",
-        #         tuple(flat_v.shape),
-        #         " flat_k.dtype=",
-        #         flat_k.dtype,
-        #         flush=True,
-        #     )
-
         # -------- 4) actually write K / V --------
         # k_out / v_out: [num_pages, page_size, num_kv_heads, head_dim]
         k_out[page_idx, page_offset] = flat_k
         v_out[page_idx, page_offset] = flat_v
 
-        # if self.debug_update:
-        #     print(f"[KV-UPDATE] write done for layer {layer_idx}", flush=True)
-
-        # ================== DEBUG: read back from the cache after writing and compare ==================
-        if os.environ.get("KTRANS_DEBUG_KV", "0") == "1":
-            try:
-                if not torch.npu.is_current_stream_capturing():
-                    self._debug_verify_k_roundtrip(
-                        flat_k=flat_k,
-                        layer_idx=layer_idx,
-                        page_idx=page_idx,
-                        page_offset=page_offset,
-                    )
-            except Exception:
-                pass
-
-        return k_out, v_out
-
     # ------------------------- get K/V -------------------------
     def get_k_cache(self, layer_idx):
         return self.k_caches[layer_idx]
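For readers skimming the diff: after the debug scaffolding is removed, the surviving `update()` path is a single paged scatter-write into `[num_pages, page_size, num_kv_heads, head_dim]` buffers. Below is a minimal, self-contained sketch of that layout (toy sizes and a fabricated contiguous page assignment, not code from this repository), ending with the same roundtrip check the deleted `_debug_verify_k_roundtrip` helper performed:

```python
# Sketch of the paged KV scatter-write that KVC2Qwen3Cache.update() performs.
# All sizes and the contiguous page assignment below are illustrative.
import torch

num_pages, page_size, num_kv_heads, head_dim = 8, 16, 4, 64
k_out = torch.zeros(num_pages, page_size, num_kv_heads, head_dim)

# A batch of bsz * q_len new tokens, already in [B, Q, KvH, D] layout.
bsz, q_len = 2, 5
key_states = torch.randn(bsz, q_len, num_kv_heads, head_dim)

# Each token carries a (page, offset) slot assigned by the scheduler; here
# we fabricate a contiguous one: global_pos = page_idx * page_size + offset.
global_pos = torch.arange(bsz * q_len)
page_idx = global_pos // page_size
page_offset = global_pos % page_size

# One advanced-indexing assignment, exactly as in the diff above.
flat_k = key_states.reshape(-1, num_kv_heads, head_dim)
k_out[page_idx, page_offset] = flat_k

# Roundtrip check: reading the same slots back must reproduce the input.
assert torch.equal(k_out[page_idx, page_offset], flat_k)
```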
diff --git a/archive/ktransformers/util/custom_gguf.py b/archive/ktransformers/util/custom_gguf.py
index e6a5c3d..8457506 100644
--- a/archive/ktransformers/util/custom_gguf.py
+++ b/archive/ktransformers/util/custom_gguf.py
@@ -667,7 +667,7 @@ def translate_name_to_gguf(name):
         name = translate_name_to_gguf_mixtral(name)
 
     if ".ffn_gate_exp." in name:
-        name = name.replace(".ffn_gate_exp.", ".ffn_gate_exps.") 
+        name = name.replace(".ffn_gate_exp.", ".ffn_gate_exps.")
     if ".ffn_up_exp." in name:
         name = name.replace(".ffn_up_exp.", ".ffn_up_exps.")
     if ".ffn_down_exp." in name:
diff --git a/doc/zh/Qwen3-MoE_tutorial_zh_for_Ascend_NPU.md b/doc/zh/Qwen3-MoE_tutorial_zh_for_Ascend_NPU.md
new file mode 100644
index 0000000..335b846
--- /dev/null
+++ b/doc/zh/Qwen3-MoE_tutorial_zh_for_Ascend_NPU.md
@@ -0,0 +1,118 @@
+# Benchmark results (output length fixed at 1K tokens, single concurrency)
+
+| Prompt length          | 1K     | 2K     | 4K     |
+| ---------------------- | ------ | ------ | ------ |
+| KTrans Prefill token/s | 134.11 | 141.60 | 143.42 |
+| KTrans Decode token/s  | 11.05  | 10.74  | 10.68  |
+
+## Prerequisites
+We measured the best performance of Qwen3-235B-A22B MoE on the following configuration:
+- Server model: Atlas 2UP
+- NPU: Atlas 300I A2
+- CPU: HUAWEI Kunpeng 920 7270Z
+- Memory: DDR5 server memory (1 TB)
+
+# Deployment
+
+***For the deployment procedure, this README only covers the steps that differ from `DeepseekR1_V3_tutorial_zh_for_Ascend_NPU.md` in the same directory.***
+
+## Physical machine installation
+
+Deploying the full Qwen3-MoE model requires enough physical memory on the machine to hold the weights of all routed experts, roughly 200 GB.
+
+Currently supported NPU model: **300I A2**.
+
+Complete the hardware installation with the support of technical staff.
+
+
+## Weight preparation
+
+Currently, to meet both performance and accuracy requirements, two sets of weights must be prepared and merged with the provided weight-merge script; only the merged weights are used in the end.
+
+Q4 weights: [Qwen3-235B-A22B-Instruct-2507-GGUF](https://modelscope.cn/models/unsloth/Qwen3-235B-A22B-Instruct-2507-GGUF/files)
+
+W8A8 weights: [Qwen3-235B-A22B-w8a8](https://modelers.cn/models/Modelers_Park/Qwen3-235B-A22B-w8a8)
+
+Use [merge_safetensor_gguf_for_qwen3.py](../../merge_tensors/merge_safetensor_gguf_for_qwen3.py) to merge the Q4 and W8A8 weights:
+
+```bash
+python merge_safetensor_gguf_for_qwen3.py --safetensor_path /mnt/weights/Qwen3-235B-A22B-Q4_K_M --gguf_path /mnt/weights/Qwen3-235B-A22B-W8A8 --output_path /mnt/weights/Qwen3-235B-A22B-q4km-w8a8
+```
+
+## kTransformers deployment
+
+Deploy the project files to the machine:
+
+- Initialize third_party. Because this step takes a long time and the repository clones are easily broken by network issues, it is recommended to initialize once and then archive the resulting files so they can simply be unpacked later.
+  ```bash
+  git clone https://github.com/kvcache-ai/ktransformers.git
+  cd ktransformers
+  git submodule update --init --recursive
+  ```
+- On ARM platforms, comment out the following lines in `./third_party/llamafile/iqk_mul_mat_arm82.cpp`:
+  ```cpp
+  #define iqk_mul_mat iqk_mul_mat_arm82
+  #define iqk_mul_mat_moe iqk_mul_mat_moe_arm82
+  ```
+- Run `source /usr/local/Ascend/ascend-toolkit/set_env.sh` (use your actual CANN-TOOLKIT installation path).
+- Run `apt install cmake libhwloc-dev pkg-config` to install the build dependencies.
+- In the attn section of `./ktransformers/configs/config.yaml` in the project directory, set `page_size: 128` and `chunk_size: 16384` (see the sketch after this list).
+- Run `USE_BALANCE_SERVE=1 USE_NUMA=1 bash ./install.sh` and wait for the installation to finish.
+  ***Before running the install command, the page size setting in `./ktransformers/configs/config.yaml` must be changed to page_size=128, because the attention operator `torch_npu.npu_fused_infer_attention_score` only supports page_size = 16 or 128.***
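+
+The exact layout of `config.yaml` may differ between checkouts, so the following is only a hedged sketch (the `attn` key names are an assumption) of applying the edit above with PyYAML instead of by hand:
+
+```python
+# Sketch: apply the page_size/chunk_size edit described above.
+# Assumption: config.yaml has an `attn` section containing these two keys;
+# adjust the key paths if your checkout lays the file out differently.
+import yaml  # pip install pyyaml
+
+path = "./ktransformers/configs/config.yaml"
+with open(path) as f:
+    cfg = yaml.safe_load(f)
+
+# npu_fused_infer_attention_score only supports page_size 16 or 128.
+cfg["attn"]["page_size"] = 128
+cfg["attn"]["chunk_size"] = 16384
+
+with open(path, "w") as f:
+    yaml.safe_dump(cfg, f, sort_keys=False)
+```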
+
+An example balance_serve startup script is given below (it uses relative paths, so place it in the project root directory):
+
+```bash
+#!/bin/bash
+export USE_MERGE=0
+export INF_NAN_MODE_FORCE_DISABLE=1
+export TASK_QUEUE_ENABLE=0
+export RANK=0
+export LOCAL_WORLD_SIZE=1
+#export PROF_DECODE=1
+#export PROF_PREFILL=1
+
+source /usr/local/Ascend/ascend-toolkit/set_env.sh
+source /usr/local/Ascend/nnal/atb/set_env.sh
+
+python ktransformers/server/main.py \
+--port 10002 \
+--model_path \
+--gguf_path \
+--cpu_infer 48 \
+--optimize_config_path ./ktransformers/optimize/optimize_rules/npu/Qwen3-Chat-300IA2-npu-serve.yaml \
+--max_new_tokens 1024 \
+--cache_lens 16384 \
+--max_batch_size 4 \
+--use_cuda_graph \
+--tp 1 \
+--backend_type balance_serve
+```
+
+Parameter notes:
+
+- `--model_path`: native kTransformers argument, str; here it specifies the path to the merged model files
+- `--gguf_path`: native kTransformers argument, str; here it also specifies the path to the merged model files
+- `--cpu_infer`: native kTransformers argument, int; controls the actual number of CPU-side worker threads; optional
+- `--optimize_config_path`: native kTransformers argument, str; specifies the model optimization rule file to use; mind the relative path; **required** here
+- `--cache_lens`: total kvcache length allocated by the scheduler. All requests share the kvcache space for the specified number of tokens (e.g. `20480`); a request releases the kvcache space it occupies once it completes; optional
+- `--use_cuda_graph`: native kTransformers argument, bool; True enables graph-sink execution, False disables it; optional
+- `--max_new_tokens`: native kTransformers argument, int; generation is stopped as soon as the number of output tokens reaches this value; optional
+- `--tp`: new argument, int; enables tensor model parallelism; currently local_chat only supports a tp size equal to the world size (local_chat does not support multiple dp); optional
+
+
+# Other issues
+
+## Other dependency problems that may occur
+
+ImportError: libhccl.so: cannot open shared object file: No such file or directory
+
+```bash
+source /usr/local/Ascend/ascend-toolkit/set_env.sh # use your actual CANN installation path
+```
+
+ImportError: libascend_hal.so: cannot open shared object file: No such file or directory
+
+```bash
+export LD_LIBRARY_PATH=/usr/local/Ascend/driver/lib64/driver:$LD_LIBRARY_PATH # use your actual Driver installation path
+```
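+
+If both libraries resolve, the following hedged sanity check should run cleanly in the same shell (the calls are standard torch_npu API; the printed values depend on your installation):
+
+```python
+# Quick environment check for the two ImportErrors above: importing torch_npu
+# fails with the libhccl/libascend_hal errors unless the CANN and driver
+# libraries are on the loader path (source the set_env.sh scripts first).
+import torch
+import torch_npu
+
+print("NPU available:", torch.npu.is_available())
+print("NPU device count:", torch.npu.device_count())
+```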