Files
ktransformers/kt-kernel/examples/test_deepseekv3.py
2025-12-17 19:46:32 +08:00

470 lines
18 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import os
import sys
import time
import json
import logging
from pathlib import Path

# Cap BLAS threading before any numeric library initializes its thread pool.
os.environ["BLAS_NUM_THREADS"] = "1"
# Make the locally built kt_kernel extension importable.
sys.path.insert(0, os.path.dirname(__file__) + "/../build")
from kt_kernel import kt_kernel_ext
from kt_kernel_ext.kvcache import ggml_type
import torch
from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForCausalLM,
    GenerationConfig,
    TextStreamer,
)
from gguf.gguf_reader import GGUFReader

logger = logging.getLogger("reader")

# Number of decoder layers to load; None means load all layers from config.json
# (set to a small int, e.g. 6, for quick smoke tests).
load_layers = None
# Shared CPU inference thread pool (304 worker threads).
CPUInfer = kt_kernel_ext.CPUInfer(304)
max_qlen = 4096
max_kvlen = 4096
page_size = 256
pages_count = 200
def read_gguf_file(gguf_file_path):
    """
    Read a GGUF file and return its tensor descriptors.

    Parameters:
    - gguf_file_path: Path to the GGUF file.

    Returns:
    - List of tensor descriptors as provided by GGUFReader.
    """
    reader = GGUFReader(gguf_file_path)
    # The original iterated reader.fields and per-tensor shape/size/type only to
    # feed now-commented-out debug prints; that dead code is removed — only the
    # tensor descriptors are needed.
    return list(reader.tensors)
def read_gguf_directory(directory):
    """
    Read all GGUF files in a directory and collect their tensors.

    Parameters:
    - directory: Path to the directory containing GGUF files.

    Returns:
    - Dict mapping tensor name -> tensor descriptor. Returns an empty dict when
      the directory does not exist or contains no .gguf files (the original
      returned None here, which crashed callers that do `.items()`).
    """
    if not os.path.isdir(directory):
        logger.error(f"Directory {directory} does not exist.")
        return {}
    files = [f for f in os.listdir(directory) if f.endswith(".gguf")]
    if not files:
        logger.info(f"No GGUF files found in {directory}.")
        return {}
    tensors = []
    for file in files:
        tensors.extend(read_gguf_file(os.path.join(directory, file)))
    # On duplicate tensor names, later files overwrite earlier ones.
    return {t.name: t for t in tensors}
def find_weights(name, weights):
    """
    Return the weight tensor with the given name.

    Parameters:
    - name: The name of the weights to find.
    - weights: List of weight tensors.

    Returns:
    - The matching weight tensor.

    Raises:
    - ValueError: if no tensor with that name exists in the list.
    """
    match = next((w for w in weights if w.name == name), None)
    if match is None:
        raise ValueError(f"Weight with name {name} not found in the provided weights list.")
    return match
def get_torch_tensor_from_gguf(gguf_weights, name):
    """Return the named GGUF tensor's data as a contiguous torch tensor (wraps the numpy buffer)."""
    entry = gguf_weights[name]
    return torch.from_numpy(entry.data).contiguous()
def get_torch_tensor_and_type_from_gguf(gguf_weights, name):
    """Return (contiguous torch tensor, GGUF quantization-type name) for the named tensor."""
    entry = gguf_weights[name]
    tensor = torch.from_numpy(entry.data).contiguous()
    return tensor, entry.tensor_type.name
def type_to_ggml_type(type):
    """
    Map a GGUF type-name string ("F32" / "F16" / "BF16") to the corresponding
    ggml_type enum member.

    Raises:
    - ValueError: for any other type name.
    """
    mapping = {"F32": "FP32", "F16": "FP16", "BF16": "BF16"}
    if type not in mapping:
        raise ValueError(f"Unsupported data type: {type}")
    return getattr(ggml_type, mapping[type])
def build_mla(layer_idx, json_config, gguf_weights):
    """
    Construct the MLA (multi-head latent attention) kernel for one decoder layer.

    Parameters:
    - layer_idx: 0-based decoder-layer index, used to select blk.{layer_idx}.* tensors.
    - json_config: Parsed HuggingFace config.json dict for the model.
    - gguf_weights: Dict of tensor name -> GGUF tensor descriptor.

    Returns:
    - A loaded MLA kernel instance; the F32 / F16 / quantized variant is chosen
      from the dtype of the q_a projection weight.
    """
    hidden_size = json_config["hidden_size"]
    num_heads = json_config["num_attention_heads"]
    q_lora_rank = json_config["q_lora_rank"]
    kv_lora_rank = json_config["kv_lora_rank"]
    nope_size = json_config["qk_nope_head_dim"]
    rope_size = json_config["qk_rope_head_dim"]
    max_position_embeddings = json_config["max_position_embeddings"]
    rope_theta = json_config["rope_theta"]
    rope_scaling = json_config["rope_scaling"]
    config = kt_kernel_ext.mla.MLAConfig(
        hidden_size,
        q_lora_rank,
        kv_lora_rank,
        num_heads,
        nope_size,
        rope_size,
    )
    config.max_qlen = max_qlen
    config.max_kvlen = max_kvlen
    config.max_position_embeddings = max_position_embeddings
    # Rope-scaling parameters copied straight from config.json's rope_scaling dict.
    config.rope_scaling_factor = rope_scaling["factor"]
    config.rope_theta = rope_theta
    config.rope_scaling_beta_fast = rope_scaling["beta_fast"]
    config.rope_scaling_beta_slow = rope_scaling["beta_slow"]
    config.rope_scaling_mscale = rope_scaling["mscale"]
    config.rope_scaling_mscale_all_dim = rope_scaling["mscale_all_dim"]
    config.rope_scaling_original_max_position_embeddings = rope_scaling["original_max_position_embeddings"]
    # Each projection weight is handed to the C++ side as a raw pointer plus a
    # ggml dtype tag.  The torch tensors below must stay alive (they do, as
    # locals) until mla.load_weights() runs — presumably load_weights copies
    # the data; TODO confirm against the kernel implementation.
    q_a_proj_weight, type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.attn_q_a.weight")
    config.q_a_proj = q_a_proj_weight.data_ptr()
    config.q_a_proj_type = type_to_ggml_type(type)
    q_a_type = type  # remembered to pick the MLA specialization below
    q_a_norm_weight, type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.attn_q_a_norm.weight")
    config.q_a_norm = q_a_norm_weight.data_ptr()
    config.q_a_norm_type = type_to_ggml_type(type)
    q_b_proj_weight, type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.attn_q_b.weight")
    config.q_b_proj = q_b_proj_weight.data_ptr()
    config.q_b_proj_type = type_to_ggml_type(type)
    kv_a_proj_with_mqa_weight, type = get_torch_tensor_and_type_from_gguf(
        gguf_weights, f"blk.{layer_idx}.attn_kv_a_mqa.weight"
    )
    config.kv_a_proj_with_mqa = kv_a_proj_with_mqa_weight.data_ptr()
    config.kv_a_proj_with_mqa_type = type_to_ggml_type(type)
    kv_a_norm_weight, type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.attn_kv_a_norm.weight")
    config.kv_a_norm = kv_a_norm_weight.data_ptr()
    config.kv_a_norm_type = type_to_ggml_type(type)
    kv_b_proj_weight, type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.attn_kv_b.weight")
    config.kv_b_proj = kv_b_proj_weight.data_ptr()
    config.kv_b_proj_type = type_to_ggml_type(type)
    o_proj_weight, type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.attn_output.weight")
    config.o_proj = o_proj_weight.data_ptr()
    config.w_o_type = type_to_ggml_type(type)
    config.layer_idx = layer_idx
    config.pool = CPUInfer.backend_
    config.page_count = pages_count
    # Choose the kernel specialization based on the q_a projection dtype.
    if q_a_type == "F32":
        mla = kt_kernel_ext.mla.MLA_F32(config)
    elif q_a_type == "F16":
        mla = kt_kernel_ext.mla.MLA_F16(config)
    elif q_a_type == "BF16":
        # mla = kt_kernel_ext.mla.MLA_F32(config)
        mla = kt_kernel_ext.mla.MLA_QUAN_F32(config)
    else:
        raise ValueError(f"Unsupported data type: {q_a_type}")
    mla.load_weights()
    mla.set_local_pages(pages_count)
    return mla
def build_ffn(layer_idx, json_config, gguf_weights):
    """
    Construct the FFN kernel for one decoder layer.

    The branch is chosen by which GGUF tensors exist: `ffn_gate.weight` marks a
    dense layer, `ffn_gate_exps.weight` marks a MoE layer with routed plus
    shared experts.

    Parameters:
    - layer_idx, json_config, gguf_weights: as in build_mla.

    Returns:
    - A loaded KMLInt8_MOE kernel instance.

    Raises:
    - ValueError: when neither a dense nor an expert FFN weight is present.
    """
    if f"blk.{layer_idx}.ffn_gate.weight" in gguf_weights:  # dense
        # NOTE(review): even this dense branch builds a MOEConfig whose expert
        # counts are num_experts_per_tok + n_shared_experts — looks odd for a
        # non-expert layer; confirm this matches the kernel's expectations.
        config = kt_kernel_ext.moe.MOEConfig(
            json_config["num_experts_per_tok"] + json_config["n_shared_experts"],
            json_config["num_experts_per_tok"] + json_config["n_shared_experts"],
            json_config["hidden_size"],
            json_config["moe_intermediate_size"],
        )
        config.layer_idx = layer_idx
        config.max_len = max_qlen
        config.pool = CPUInfer.backend_
        gate, gate_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.ffn_gate.weight")
        up, up_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.ffn_up.weight")
        down, down_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.ffn_down.weight")
        # Raw pointers + ggml dtype tags; the tensors must stay alive until
        # moe.load_weights() below has consumed them.
        config.gate_proj = gate.data_ptr()
        config.gate_type = type_to_ggml_type(gate_type)
        config.up_proj = up.data_ptr()
        config.up_type = type_to_ggml_type(up_type)
        config.down_proj = down.data_ptr()
        config.down_type = type_to_ggml_type(down_type)
        moe = kt_kernel_ext.moe.KMLInt8_MOE(config)
        moe.load_weights()
        return moe
    elif f"blk.{layer_idx}.ffn_gate_exps.weight" in gguf_weights:
        # MoE layer: n_routed_experts routed experts plus n_shared_experts
        # shared expert(s), packed into a single expert dimension.
        config = kt_kernel_ext.moe.MOEConfig(
            json_config["n_routed_experts"] + json_config["n_shared_experts"],
            json_config["num_experts_per_tok"] + json_config["n_shared_experts"],
            json_config["hidden_size"],
            json_config["moe_intermediate_size"],
        )
        config.layer_idx = layer_idx
        config.max_len = max_qlen
        config.pool = CPUInfer.backend_
        gate, gate_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.ffn_gate_exps.weight")
        up, up_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.ffn_up_exps.weight")
        down, down_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.ffn_down_exps.weight")
        gate_sh, gate_sh_type = get_torch_tensor_and_type_from_gguf(
            gguf_weights, f"blk.{layer_idx}.ffn_gate_shexp.weight"
        )
        up_sh, up_sh_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.ffn_up_shexp.weight")
        down_sh, down_sh_type = get_torch_tensor_and_type_from_gguf(
            gguf_weights, f"blk.{layer_idx}.ffn_down_shexp.weight"
        )
        # Append the shared-expert weights as one extra expert along dim 0 so
        # the kernel sees routed and shared experts uniformly.
        gate_sh_expanded = gate_sh.unsqueeze(0)
        gate = torch.cat([gate, gate_sh_expanded], dim=0).contiguous()
        up_sh_expanded = up_sh.unsqueeze(0)
        up = torch.cat([up, up_sh_expanded], dim=0).contiguous()
        down_sh_expanded = down_sh.unsqueeze(0)
        down = torch.cat([down, down_sh_expanded], dim=0).contiguous()
        config.gate_proj = gate.data_ptr()
        config.gate_type = type_to_ggml_type(gate_type)
        config.up_proj = up.data_ptr()
        config.up_type = type_to_ggml_type(up_type)
        config.down_proj = down.data_ptr()
        config.down_type = type_to_ggml_type(down_type)
        moe = kt_kernel_ext.moe.KMLInt8_MOE(config)
        moe.load_weights()
        return moe
    else:
        raise ValueError(f"Unsupported FFN type for layer {layer_idx}")
def build_moegate(layer_idx, json_config, gguf_weights):
    """
    Construct the MoE router gate for one decoder layer from the ffn_gate_inp
    weight and the expert-probability correction bias.

    Parameters:
    - layer_idx, json_config, gguf_weights: as in build_mla.

    Returns:
    - A kt_kernel_ext.gate.MoEGate instance.
    """
    config = kt_kernel_ext.gate.GateConfig(
        json_config["hidden_size"],
        json_config["num_experts_per_tok"],
        json_config["n_routed_experts"],
        json_config["n_group"],
        json_config["topk_group"],
    )
    config.routed_scaling_factor = json_config["routed_scaling_factor"]
    config.pool = CPUInfer.backend_
    weight, weight_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.ffn_gate_inp.weight")
    config.weight = weight.data_ptr()
    config.weight_type = type_to_ggml_type(weight_type)
    bias, bias_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.exp_probs_b.bias")
    config.e_score_correction_bias = bias.data_ptr()
    config.e_score_correction_bias_type = type_to_ggml_type(bias_type)
    # NOTE(review): `weight` and `bias` are locals whose raw data_ptr()s are
    # handed to GateConfig, and no explicit load call happens before returning;
    # this assumes the MoEGate constructor copies the data — verify, otherwise
    # these pointers dangle once the function returns.
    gate = kt_kernel_ext.gate.MoEGate(config)
    return gate
def build_llm(json_config, gguf_weights):
    """
    Assemble the full DeepseekV3 causal LM from config.json values and GGUF weights.

    Parameters:
    - json_config: Parsed HuggingFace config.json dict.
    - gguf_weights: Dict of tensor name -> GGUF tensor descriptor.

    Returns:
    - A kt_kernel_ext.DeepseekV3ForCausalLM whose model holds one decoder layer
      per loaded layer (all layers, or `load_layers` when set).
    """
    general_config = kt_kernel_ext.GeneralConfig()
    general_config.vocab_size = json_config["vocab_size"]
    general_config.hidden_size = json_config["hidden_size"]
    general_config.num_experts_per_tok = json_config["num_experts_per_tok"]
    general_config.n_routed_experts = json_config["n_routed_experts"]
    general_config.n_shared_experts = json_config["n_shared_experts"]
    general_config.max_qlen = max_qlen
    # Global tensors are passed as raw pointers + ggml dtype tags; this assumes
    # the C++ side copies the data while the locals below are still alive —
    # TODO confirm.
    lm_heads, lm_heads_type = get_torch_tensor_and_type_from_gguf(gguf_weights, "output.weight")
    general_config.lm_heads_ptr = lm_heads.data_ptr()
    general_config.lm_heads_type = type_to_ggml_type(lm_heads_type)
    output_norm, output_norm_type = get_torch_tensor_and_type_from_gguf(gguf_weights, "output_norm.weight")
    general_config.norm_weights_ptr = output_norm.data_ptr()
    general_config.norm_weights_type = type_to_ggml_type(output_norm_type)
    # BUG FIX: the original read the module-level global `weights` here instead
    # of the `gguf_weights` parameter (it only worked because the caller passed
    # that same global).
    token_embd, token_embd_type = get_torch_tensor_and_type_from_gguf(gguf_weights, "token_embd.weight")
    general_config.token_embd_ptr = token_embd.data_ptr()
    general_config.token_embd_type = type_to_ggml_type(token_embd_type)
    general_config.pool = CPUInfer.backend_
    llm = kt_kernel_ext.DeepseekV3ForCausalLM(general_config)
    model = kt_kernel_ext.DeepseekV3Model(general_config)
    llm.model = model
    decoder_layers = []
    real_load_layers = json_config["num_hidden_layers"] if load_layers is None else load_layers
    for i in range(real_load_layers):
        layer = kt_kernel_ext.DeepseekV3DecoderLayer(general_config, i)
        attn_norm, attn_norm_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{i}.attn_norm.weight")
        ffn_norm, ffn_norm_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{i}.ffn_norm.weight")
        layer.load_norm(
            attn_norm.data_ptr(),
            type_to_ggml_type(attn_norm_type),
            ffn_norm.data_ptr(),
            type_to_ggml_type(ffn_norm_type),
        )
        layer.self_attn = build_mla(i, json_config, gguf_weights)
        # Only MoE layers carry a router gate; dense layers skip it.
        if f"blk.{i}.ffn_gate_inp.weight" in gguf_weights:
            layer.gate = build_moegate(i, json_config, gguf_weights)
        layer.ffn = build_ffn(i, json_config, gguf_weights)
        decoder_layers.append(layer)
    model.layers = decoder_layers
    return llm
# ---- Script entry: load config.json, GGUF weights, the LLM, and the tokenizer ----
safetensor_path = "/home/bd/models/DeepSeek-R1"
json_path = os.path.join(safetensor_path, "config.json")
# Use a context manager so the config file handle is closed deterministically
# (the original leaked it via json.load(open(...))).
with open(json_path, "r") as json_file:
    json_config = json.load(json_file)
print(json_config)
gguf_path = "/home/bd/models/DeepSeek-R1-BF16"
weights = read_gguf_directory(gguf_path)
weights = dict(sorted(weights.items()))
# Dump every tensor found; only the values are needed, so iterate them directly.
for t in weights.values():
    print(f"Found weight: {t.name}, Shape: {t.shape}, Type: {t.tensor_type.name}, Size: {t.n_elements}")
print("Building LLM ...")
load_start_time = time.perf_counter()
llm = build_llm(json_config, weights)
load_end_time = time.perf_counter()
print(f"Load time: {load_end_time - load_start_time:.4f} seconds")
print("Release Weight Tensors ...")
# Drop the Python-side references so the weight buffers can be freed.
weights = None
print("Loading Configs ...")
tokenizer = AutoTokenizer.from_pretrained(safetensor_path, trust_remote_code=True)
config = AutoConfig.from_pretrained(safetensor_path, trust_remote_code=True)
force_think = False
# Reusable logits buffer that llm.forward() writes into.
output_logits = torch.zeros((max_qlen, json_config["vocab_size"]), dtype=torch.float32)
def start_chat(content=None):
    """
    Run one interactive chat session: build the prompt (or use *content*), then
    greedily decode one token per model forward pass until EOS, streaming the
    output to stdout.

    Parameters:
    - content: Optional prompt text; when None the user is asked via input().
    """
    if content is None:
        content = input("Chat: ")
    messages = [{"role": "user", "content": content}]
    input_tensor = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt")
    if force_think:
        token_thinks = torch.tensor(
            [tokenizer.encode("<think>\\n", add_special_tokens=False)], device=input_tensor.device
        )
        input_tensor = torch.cat([input_tensor, token_thinks], dim=1)
    input_tensor = input_tensor.squeeze(0)  # drop the batch dimension -> 1-D token sequence
    print(f"Input tensor: {input_tensor}, type {input_tensor.dtype}, shape {input_tensor.shape}")
    kvlen = 0
    # Greedy decode loop: each iteration feeds only the not-yet-cached suffix
    # of input_tensor (full prompt on the first pass, one token afterwards).
    # The original's "while True or step > 0" condition was always True and its
    # `step` counter dead; both are removed.
    while True:
        stream = TextStreamer(tokenizer)
        qlen = input_tensor.shape[0]
        qlens = [qlen - kvlen]
        kvlens = [kvlen]
        page_tables = [list(range(pages_count))]
        start_time = time.perf_counter()
        llm.forward(qlens, page_tables, kvlens, input_tensor[kvlen:].data_ptr(), output_logits.data_ptr())
        end_time = time.perf_counter()
        print(
            f"Forward time: {end_time - start_time:.4f} seconds, tps: {qlens[0] / (end_time - start_time)} tokens/sec"
        )
        # Row 0 of the logits buffer is read for sampling — presumably
        # llm.forward writes the last processed token's logits there; verify.
        logits = output_logits[0]
        # Greedy sampling: always take the arg-max token.
        next_token = torch.argmax(logits).item()
        kvlen = input_tensor.shape[0]
        input_tensor = torch.cat((input_tensor, torch.tensor([next_token])), dim=-1)
        if next_token == tokenizer.eos_token_id or tokenizer.decode(next_token) == "<|im_end|>":
            stream.end()
            break
        else:
            stream.put(torch.tensor([next_token]))
job_id = 0
# Interactive driver loop: each iteration runs one chat session (or reads a
# prompt from a file), surviving Ctrl-C and unexpected errors.
while True:
    try:
        # ---------- Let the user decide whether to continue ----------
        choice = input("\n【回车】开始对话 | 输入 1 读取文件 | 输入 q/quit/exit 退出程序: ").strip().lower()
        if choice in {"q", "quit", "exit"}:
            print("收到退出指令,程序结束。")
            break
        elif choice == "1":
            file_path = input("请输入要读取的文件路径:").strip()
            if not Path(file_path).is_file():
                print(f"文件 {file_path} 不存在,请检查路径。")
                continue
            with open(file_path, "r", encoding="utf-8") as file:
                content = file.read()
            print(f"读取到内容:\n{content}\n")
            start_chat(content)
        else:
            start_chat()
    except KeyboardInterrupt:
        # Ctrl-C at any time abandons the current session and restarts the loop.
        print(f"\n检测到 Ctrl-C已终止对话 #{job_id},马上重启…")
    except Exception as e:
        # Any other exception: report it, log it, and restart the loop.
        print(f"\n发生错误:{e}\n已终止对话 #{job_id},马上重启…")
        logger.error(f"Error in job {job_id}: {e}", exc_info=True)
    finally:
        job_id += 1  # advance the session counter whether or not the session ended cleanly