mirror of
https://github.com/kvcache-ai/ktransformers.git
synced 2026-03-15 02:47:22 +00:00
470 lines
18 KiB
Python
470 lines
18 KiB
Python
import os, sys
|
||
import time
|
||
|
||
os.environ["BLAS_NUM_THREADS"] = "1"
|
||
sys.path.insert(0, os.path.dirname(__file__) + "/../build")
|
||
from kt_kernel import kt_kernel_ext
|
||
from kt_kernel_ext.kvcache import ggml_type
|
||
import torch
|
||
import logging
|
||
import sys
|
||
import json
|
||
from pathlib import Path
|
||
from transformers import (
|
||
AutoTokenizer,
|
||
AutoConfig,
|
||
AutoModelForCausalLM,
|
||
GenerationConfig,
|
||
TextStreamer,
|
||
)
|
||
|
||
logger = logging.getLogger("reader")
|
||
|
||
from gguf.gguf_reader import GGUFReader
|
||
|
||
# load_layers = 6
|
||
# Number of decoder layers build_llm() constructs; None means "use
# json_config["num_hidden_layers"]".
load_layers = None
|
||
# Shared kt_kernel_ext.CPUInfer instance; its `backend_` attribute is assigned to
# the `pool` field of every config object built in this file.
# NOTE(review): 304 is presumably a worker/thread count - confirm against the extension.
CPUInfer = kt_kernel_ext.CPUInfer(304)
|
||
# Copied into the MLA config (max_qlen), the MoE config (max_len) and the general
# config (max_qlen); also the row count of the output_logits buffer.
max_qlen = 4096
|
||
# Copied into the MLA config as max_kvlen.
max_kvlen = 4096
|
||
# NOTE(review): not referenced anywhere else in the visible part of this file.
page_size = 256
|
||
# Copied into the MLA config (page_count), passed to set_local_pages(), and used
# as the length of the page table handed to llm.forward().
pages_count = 200
|
||
|
||
|
||
def read_gguf_file(gguf_file_path):
    """
    Open a GGUF file and return its tensor entries.

    Nothing is printed. The former key-value / tensor dump was debug output; the
    loop that fed it has been removed because it indexed every metadata field
    (``field.parts[field.data[0]]``) only to throw the value away, and could
    therefore fail on a field whose ``data`` is empty.

    Parameters:
    - gguf_file_path: Path to the GGUF file.

    Returns:
    - A new list containing every entry of ``GGUFReader(gguf_file_path).tensors``,
      in the order the reader reports them.
    """
    reader = GGUFReader(gguf_file_path)
    # Hand back a fresh list so callers may extend or reorder it freely.
    return list(reader.tensors)
|
||
|
||
|
||
def read_gguf_directory(directory):
    """
    Collect the tensor entries of every ``*.gguf`` file in a directory.

    Parameters:
    - directory: Path to the directory containing GGUF files.

    Returns:
    - A dict mapping tensor name to the tensor entry returned by
      read_gguf_file(), or None when the directory does not exist or holds no
      ``.gguf`` file (the reason is logged). When the same tensor name occurs in
      several files, the entry from the file whose name sorts last is kept.
    """
    if not os.path.isdir(directory):
        logger.error(f"Directory {directory} does not exist.")
        return None

    # os.listdir() returns names in arbitrary order; sort them so the result
    # (in particular which duplicate wins) is reproducible across runs.
    files = sorted(f for f in os.listdir(directory) if f.endswith(".gguf"))
    if not files:
        logger.info(f"No GGUF files found in {directory}.")
        return None

    tensors_by_name = {}
    for file_name in files:
        file_path = os.path.join(directory, file_name)
        for tensor in read_gguf_file(file_path):
            tensors_by_name[tensor.name] = tensor
    return tensors_by_name
|
||
|
||
|
||
def find_weights(name, weights):
    """
    Finds and returns the weight whose ``name`` attribute equals ``name``.

    Parameters:
    - name: The name of the weights to find.
    - weights: Either an iterable of weight objects exposing ``.name``, or a
      dict whose values are such objects (the layout read_gguf_directory()
      returns).

    Returns:
    - The first matching weight object.

    Raises:
    - ValueError: if no weight carries the requested name.
    """
    # Iterating a dict yields its keys (plain strings without ``.name``), so look
    # at the values instead when a dict is supplied.
    candidates = weights.values() if isinstance(weights, dict) else weights
    for weight in candidates:
        if weight.name == name:
            return weight
    raise ValueError(f"Weight with name {name} not found in the provided weights list.")
|
||
|
||
|
||
def get_torch_tensor_from_gguf(gguf_weights, name):
    """
    Return the data of the GGUF entry ``name`` as a contiguous torch tensor.

    Parameters:
    - gguf_weights: Mapping from tensor name to an entry exposing a numpy
      array as ``.data``.
    - name: Key of the entry to convert.

    Returns:
    - ``torch.from_numpy(entry.data).contiguous()``.
    """
    entry = gguf_weights[name]
    as_tensor = torch.from_numpy(entry.data)
    return as_tensor.contiguous()
|
||
|
||
|
||
def get_torch_tensor_and_type_from_gguf(gguf_weights, name):
    """
    Return the GGUF entry ``name`` as a contiguous torch tensor plus its type name.

    Parameters:
    - gguf_weights: Mapping from tensor name to an entry exposing a numpy
      array as ``.data`` and an enum-like ``.tensor_type`` with a ``.name``.
    - name: Key of the entry to convert.

    Returns:
    - A tuple ``(tensor, type_name)`` where ``type_name`` is
      ``entry.tensor_type.name``.
    """
    entry = gguf_weights[name]
    tensor = torch.from_numpy(entry.data).contiguous()
    type_name = entry.tensor_type.name
    return tensor, type_name
|
||
|
||
|
||
def type_to_ggml_type(type):
    """
    Translate a GGUF tensor type name into the matching ``ggml_type`` member.

    Parameters:
    - type: GGUF type name; "F32", "F16" and "BF16" are recognised.

    Returns:
    - ``ggml_type.FP32``, ``ggml_type.FP16`` or ``ggml_type.BF16`` respectively.

    Raises:
    - ValueError: for any other value.
    """
    # (GGUF type name, attribute name on ggml_type); the attribute is resolved
    # only for the entry that matches.
    known_types = (("F32", "FP32"), ("F16", "FP16"), ("BF16", "BF16"))
    for gguf_name, member_name in known_types:
        if type == gguf_name:
            return getattr(ggml_type, member_name)
    raise ValueError(f"Unsupported data type: {type}")
|
||
|
||
|
||
def build_mla(layer_idx, json_config, gguf_weights):
    """
    Build the MLA attention object for one decoder layer and load its weights.

    Parameters:
    - layer_idx: Index of the decoder layer; selects the ``blk.{layer_idx}.*``
      tensors.
    - json_config: Model configuration dict (hidden_size, num_attention_heads,
      q_lora_rank, kv_lora_rank, qk_nope_head_dim, qk_rope_head_dim,
      max_position_embeddings, rope_theta, rope_scaling).
    - gguf_weights: Mapping from tensor name to GGUF tensor entry.

    Returns:
    - The constructed ``kt_kernel_ext.mla`` object after ``load_weights()`` and
      ``set_local_pages(pages_count)`` have been called on it.

    Raises:
    - ValueError: if a tensor type is not F32 / F16 / BF16.
    """
    hidden_size = json_config["hidden_size"]
    num_heads = json_config["num_attention_heads"]
    q_lora_rank = json_config["q_lora_rank"]
    kv_lora_rank = json_config["kv_lora_rank"]
    nope_size = json_config["qk_nope_head_dim"]
    rope_size = json_config["qk_rope_head_dim"]
    max_position_embeddings = json_config["max_position_embeddings"]
    rope_theta = json_config["rope_theta"]
    rope_scaling = json_config["rope_scaling"]

    config = kt_kernel_ext.mla.MLAConfig(
        hidden_size, q_lora_rank, kv_lora_rank, num_heads, nope_size, rope_size
    )
    config.max_qlen = max_qlen
    config.max_kvlen = max_kvlen
    config.max_position_embeddings = max_position_embeddings
    config.rope_scaling_factor = rope_scaling["factor"]
    config.rope_theta = rope_theta
    config.rope_scaling_beta_fast = rope_scaling["beta_fast"]
    config.rope_scaling_beta_slow = rope_scaling["beta_slow"]
    config.rope_scaling_mscale = rope_scaling["mscale"]
    config.rope_scaling_mscale_all_dim = rope_scaling["mscale_all_dim"]
    config.rope_scaling_original_max_position_embeddings = rope_scaling["original_max_position_embeddings"]

    # (GGUF tensor suffix, config pointer attribute, config type attribute).
    # The output projection's type attribute really is named ``w_o_type``.
    weight_slots = (
        ("attn_q_a", "q_a_proj", "q_a_proj_type"),
        ("attn_q_a_norm", "q_a_norm", "q_a_norm_type"),
        ("attn_q_b", "q_b_proj", "q_b_proj_type"),
        ("attn_kv_a_mqa", "kv_a_proj_with_mqa", "kv_a_proj_with_mqa_type"),
        ("attn_kv_a_norm", "kv_a_norm", "kv_a_norm_type"),
        ("attn_kv_b", "kv_b_proj", "kv_b_proj_type"),
        ("attn_output", "o_proj", "w_o_type"),
    )

    # The config only receives raw addresses (data_ptr()), so every tensor stays
    # referenced here until the function returns, i.e. past load_weights().
    # NOTE(review): presumably load_weights() is where the native side reads
    # them - confirm.
    live_tensors = []
    type_name_by_suffix = {}
    for suffix, ptr_attr, type_attr in weight_slots:
        tensor, type_name = get_torch_tensor_and_type_from_gguf(
            gguf_weights, f"blk.{layer_idx}.{suffix}.weight"
        )
        live_tensors.append(tensor)
        setattr(config, ptr_attr, tensor.data_ptr())
        setattr(config, type_attr, type_to_ggml_type(type_name))
        type_name_by_suffix[suffix] = type_name

    config.layer_idx = layer_idx
    config.pool = CPUInfer.backend_
    config.page_count = pages_count

    # The implementation class is chosen from the type of the q_a projection
    # alone; BF16 weights are served by MLA_QUAN_F32.
    q_a_type = type_name_by_suffix["attn_q_a"]
    class_name_by_type = (("F32", "MLA_F32"), ("F16", "MLA_F16"), ("BF16", "MLA_QUAN_F32"))
    for type_name, class_name in class_name_by_type:
        if q_a_type == type_name:
            mla = getattr(kt_kernel_ext.mla, class_name)(config)
            break
    else:
        raise ValueError(f"Unsupported data type: {q_a_type}")

    mla.load_weights()
    mla.set_local_pages(pages_count)
    return mla
|
||
|
||
|
||
def build_ffn(layer_idx, json_config, gguf_weights):
    """
    Build the feed-forward object (a ``KMLInt8_MOE``) for one decoder layer.

    Two layouts are recognised by the tensors present for the layer:
    - dense: ``blk.{i}.ffn_gate.weight`` exists; gate/up/down are used as is and
      both leading MOEConfig arguments are
      ``num_experts_per_tok + n_shared_experts``.
    - experts: ``blk.{i}.ffn_gate_exps.weight`` exists; the ``*_shexp`` tensors
      are appended to the ``*_exps`` tensors along dim 0 and the first MOEConfig
      argument is ``n_routed_experts + n_shared_experts``.

    Parameters:
    - layer_idx: Index of the decoder layer.
    - json_config: Model configuration dict.
    - gguf_weights: Mapping from tensor name to GGUF tensor entry.

    Returns:
    - The ``kt_kernel_ext.moe.KMLInt8_MOE`` object after ``load_weights()``.

    Raises:
    - ValueError: if neither layout is present for the layer.
    """
    prefix = f"blk.{layer_idx}."
    is_dense = f"{prefix}ffn_gate.weight" in gguf_weights
    if not is_dense and f"{prefix}ffn_gate_exps.weight" not in gguf_weights:
        raise ValueError(f"Unsupported FFN type for layer {layer_idx}")

    if is_dense:
        first_arg = json_config["num_experts_per_tok"] + json_config["n_shared_experts"]
        variant = ""
    else:
        first_arg = json_config["n_routed_experts"] + json_config["n_shared_experts"]
        variant = "_exps"

    config = kt_kernel_ext.moe.MOEConfig(
        first_arg,
        json_config["num_experts_per_tok"] + json_config["n_shared_experts"],
        json_config["hidden_size"],
        json_config["moe_intermediate_size"],
    )
    config.layer_idx = layer_idx
    config.max_len = max_qlen
    config.pool = CPUInfer.backend_

    def load(part, tensor_variant):
        # Fetch "blk.{i}.ffn_{part}{variant}.weight" as (tensor, type name).
        return get_torch_tensor_and_type_from_gguf(
            gguf_weights, f"{prefix}ffn_{part}{tensor_variant}.weight"
        )

    gate, gate_type = load("gate", variant)
    up, up_type = load("up", variant)
    down, down_type = load("down", variant)

    if not is_dense:
        # Append the shared expert as one more slice along dim 0. Its own type
        # names are not used; the types of the *_exps tensors are reported.
        gate_sh, _ = load("gate", "_shexp")
        up_sh, _ = load("up", "_shexp")
        down_sh, _ = load("down", "_shexp")
        gate = torch.cat([gate, gate_sh.unsqueeze(0)], dim=0).contiguous()
        up = torch.cat([up, up_sh.unsqueeze(0)], dim=0).contiguous()
        down = torch.cat([down, down_sh.unsqueeze(0)], dim=0).contiguous()

    config.gate_proj = gate.data_ptr()
    config.gate_type = type_to_ggml_type(gate_type)
    config.up_proj = up.data_ptr()
    config.up_type = type_to_ggml_type(up_type)
    config.down_proj = down.data_ptr()
    config.down_type = type_to_ggml_type(down_type)

    moe = kt_kernel_ext.moe.KMLInt8_MOE(config)
    moe.load_weights()
    return moe
|
||
|
||
|
||
def build_moegate(layer_idx, json_config, gguf_weights):
    """
    Build the MoE routing gate for one decoder layer.

    Parameters:
    - layer_idx: Index of the decoder layer; selects
      ``blk.{layer_idx}.ffn_gate_inp.weight`` and
      ``blk.{layer_idx}.exp_probs_b.bias``.
    - json_config: Model configuration dict (hidden_size, num_experts_per_tok,
      n_routed_experts, n_group, topk_group, routed_scaling_factor).
    - gguf_weights: Mapping from tensor name to GGUF tensor entry.

    Returns:
    - A ``kt_kernel_ext.gate.MoEGate`` built from the filled-in GateConfig.
    """
    # Positional constructor arguments, in the order GateConfig receives them.
    ctor_keys = ("hidden_size", "num_experts_per_tok", "n_routed_experts", "n_group", "topk_group")
    config = kt_kernel_ext.gate.GateConfig(*(json_config[key] for key in ctor_keys))
    config.routed_scaling_factor = json_config["routed_scaling_factor"]
    config.pool = CPUInfer.backend_

    tensor_prefix = f"blk.{layer_idx}."

    router, router_type = get_torch_tensor_and_type_from_gguf(
        gguf_weights, tensor_prefix + "ffn_gate_inp.weight"
    )
    config.weight = router.data_ptr()
    config.weight_type = type_to_ggml_type(router_type)

    correction, correction_type = get_torch_tensor_and_type_from_gguf(
        gguf_weights, tensor_prefix + "exp_probs_b.bias"
    )
    config.e_score_correction_bias = correction.data_ptr()
    config.e_score_correction_bias_type = type_to_ggml_type(correction_type)

    return kt_kernel_ext.gate.MoEGate(config)
|
||
|
||
|
||
def build_llm(json_config, gguf_weights):
    """
    Assemble the full ``DeepseekV3ForCausalLM`` object from config and GGUF tensors.

    Parameters:
    - json_config: Model configuration dict (vocab_size, hidden_size,
      num_experts_per_tok, n_routed_experts, n_shared_experts,
      num_hidden_layers, plus the keys the per-layer builders read).
    - gguf_weights: Mapping from tensor name to GGUF tensor entry. Every tensor
      is taken from this argument.

    Returns:
    - The ``kt_kernel_ext.DeepseekV3ForCausalLM`` object; its ``model`` carries
      one decoder layer per built layer (``load_layers`` of them, or
      ``num_hidden_layers`` when ``load_layers`` is None).
    """
    general_config = kt_kernel_ext.GeneralConfig()
    general_config.vocab_size = json_config["vocab_size"]
    general_config.hidden_size = json_config["hidden_size"]
    general_config.num_experts_per_tok = json_config["num_experts_per_tok"]
    general_config.n_routed_experts = json_config["n_routed_experts"]
    general_config.n_shared_experts = json_config["n_shared_experts"]
    general_config.max_qlen = max_qlen

    lm_heads, lm_heads_type = get_torch_tensor_and_type_from_gguf(gguf_weights, "output.weight")
    general_config.lm_heads_ptr = lm_heads.data_ptr()
    general_config.lm_heads_type = type_to_ggml_type(lm_heads_type)

    output_norm, output_norm_type = get_torch_tensor_and_type_from_gguf(gguf_weights, "output_norm.weight")
    general_config.norm_weights_ptr = output_norm.data_ptr()
    general_config.norm_weights_type = type_to_ggml_type(output_norm_type)

    # Fix: this lookup used the module-level ``weights`` global instead of the
    # ``gguf_weights`` argument, so the function only worked when called from
    # this script with that exact global defined.
    token_embd, token_embd_type = get_torch_tensor_and_type_from_gguf(gguf_weights, "token_embd.weight")
    general_config.token_embd_ptr = token_embd.data_ptr()
    general_config.token_embd_type = type_to_ggml_type(token_embd_type)

    general_config.pool = CPUInfer.backend_

    llm = kt_kernel_ext.DeepseekV3ForCausalLM(general_config)
    model = kt_kernel_ext.DeepseekV3Model(general_config)
    llm.model = model

    decoder_layers = []
    real_load_layers = json_config["num_hidden_layers"] if load_layers is None else load_layers

    for i in range(real_load_layers):
        layer = kt_kernel_ext.DeepseekV3DecoderLayer(general_config, i)
        attn_norm, attn_norm_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{i}.attn_norm.weight")
        ffn_norm, ffn_norm_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{i}.ffn_norm.weight")

        layer.load_norm(
            attn_norm.data_ptr(),
            type_to_ggml_type(attn_norm_type),
            ffn_norm.data_ptr(),
            type_to_ggml_type(ffn_norm_type),
        )
        layer.self_attn = build_mla(i, json_config, gguf_weights)
        # Only layers that ship a router tensor get a gate.
        if f"blk.{i}.ffn_gate_inp.weight" in gguf_weights:
            layer.gate = build_moegate(i, json_config, gguf_weights)
        layer.ffn = build_ffn(i, json_config, gguf_weights)
        decoder_layers.append(layer)

    model.layers = decoder_layers
    return llm
|
||
|
||
|
||
# ---------------------------------------------------------------------------
# Script setup: read the model config, load all GGUF tensors, build the native
# model, then load the tokenizer and allocate the logits buffer.
# ---------------------------------------------------------------------------
safetensor_path = "/home/bd/models/DeepSeek-R1"
json_path = os.path.join(safetensor_path, "config.json")
# Use a context manager so the config file handle is closed deterministically.
with open(json_path, "r") as config_file:
    json_config = json.load(config_file)
print(json_config)

gguf_path = "/home/bd/models/DeepSeek-R1-BF16"
weights = read_gguf_directory(gguf_path)
if weights is None:
    # read_gguf_directory() logs and returns None for a missing or empty
    # directory; fail with a clear message instead of an AttributeError below.
    raise RuntimeError(f"No GGUF weights could be read from {gguf_path}")
weights = dict(sorted(weights.items()))

for t in weights.values():
    print(f"Found weight: {t.name}, Shape: {t.shape}, Type: {t.tensor_type.name}, Size: {t.n_elements}")

print("Building LLM ...")
load_start_time = time.perf_counter()
llm = build_llm(json_config, weights)
load_end_time = time.perf_counter()
print(f"Load time: {load_end_time - load_start_time:.4f} seconds")

print("Release Weight Tensors ...")
# Drop the only reference this script holds to the GGUF tensor entries.
# NOTE(review): assumes the native objects no longer need the addresses that
# were handed to them during build_llm() - confirm.
weights = None
print("Loading Configs ...")

tokenizer = AutoTokenizer.from_pretrained(safetensor_path, trust_remote_code=True)
config = AutoConfig.from_pretrained(safetensor_path, trust_remote_code=True)

# When True, start_chat() appends the encoded "<think>" marker to the prompt.
force_think = False

# Buffer whose address is passed to llm.forward(); start_chat() reads row 0 of
# it as the logits for the next token.
output_logits = torch.zeros((max_qlen, json_config["vocab_size"]), dtype=torch.float32)
|
||
|
||
|
||
def start_chat(content=None):
    """
    Run one single-turn chat: tokenize the prompt and greedily decode a reply.

    Each step calls ``llm.forward`` on the tokens that have not been processed
    yet, takes the argmax of ``output_logits[0]`` as the next token and streams
    it to stdout. Decoding stops at the tokenizer's EOS token, at a token that
    decodes to "<|im_end|>", or when the sequence would exceed ``max_kvlen``.

    Parameters:
    - content: The user message. When None, it is read from stdin.

    Raises:
    - ValueError: if the tokenized prompt is longer than ``max_qlen``.
    """
    if content is None:
        content = input("Chat: ")

    messages = [{"role": "user", "content": content}]
    input_tensor = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt")
    if force_think:
        # NOTE(review): "\\n" is a literal backslash followed by "n", not a
        # newline character - confirm this is what the chat template expects.
        token_thinks = torch.tensor(
            [tokenizer.encode("<think>\\n", add_special_tokens=False)], device=input_tensor.device
        )
        input_tensor = torch.cat([input_tensor, token_thinks], dim=1)
    input_tensor = input_tensor.squeeze(0)  # drop the batch dimension -> 1-D token ids

    print(f"Input tensor: {input_tensor}, type {input_tensor.dtype}, shape {input_tensor.shape}")

    # The logits buffer and the native configs are sized with max_qlen.
    # NOTE(review): presumably a longer prompt would overrun them - confirm.
    if input_tensor.shape[0] > max_qlen:
        raise ValueError(f"Prompt has {input_tensor.shape[0]} tokens, more than max_qlen={max_qlen}")

    # One streamer for the whole reply: TextStreamer buffers incomplete words
    # between put() calls, so recreating it per token would drop that buffer.
    stream = TextStreamer(tokenizer)
    page_tables = [list(range(pages_count))]
    kvlen = 0  # number of leading tokens already processed by llm.forward
    while True:
        total_len = input_tensor.shape[0]
        if total_len > max_kvlen:
            # NOTE(review): presumably max_kvlen is the native cache capacity - confirm.
            stream.end()
            print(f"Reached max_kvlen={max_kvlen}, stopping generation.")
            break

        qlens = [total_len - kvlen]
        kvlens = [kvlen]
        start_time = time.perf_counter()
        llm.forward(qlens, page_tables, kvlens, input_tensor[kvlen:].data_ptr(), output_logits.data_ptr())
        end_time = time.perf_counter()
        print(
            f"Forward time: {end_time - start_time:.4f} seconds, tps: {qlens[0] / (end_time - start_time)} tokens/sec"
        )

        # Greedy sampling from the first row of the logits buffer.
        next_token = torch.argmax(output_logits[0]).item()
        kvlen = total_len
        input_tensor = torch.cat((input_tensor, torch.tensor([next_token])), dim=-1)

        if next_token == tokenizer.eos_token_id or tokenizer.decode(next_token) == "<|im_end|>":
            stream.end()
            break
        stream.put(torch.tensor([next_token]))
|
||
|
||
|
||
# Interactive driver: repeatedly ask the user what to do and run one chat per
# round. job_id numbers the rounds for the restart/error messages.
job_id = 0
while True:
    try:
        # ---------- let the user decide whether to continue ----------
        choice = input("\n【回车】开始对话 | 输入 1 读取文件 | 输入 q/quit/exit 退出程序: ").strip().lower()
        if choice in {"q", "quit", "exit"}:
            print("收到退出指令,程序结束。")
            break
        elif choice == "1":
            file_path = input("请输入要读取的文件路径:").strip()
            if not Path(file_path).is_file():
                print(f"文件 {file_path} 不存在,请检查路径。")
                continue
            with open(file_path, "r", encoding="utf-8") as file:
                content = file.read()
            print(f"读取到内容:\n{content}\n")
            start_chat(content)
        else:
            start_chat()

    except EOFError:
        # stdin is closed: every further input() would raise again, so the
        # generic handler below would make this loop spin forever. Stop instead.
        print("\n输入已结束(EOF),程序结束。")
        break
    except KeyboardInterrupt:
        # Ctrl-C at any time: abandon the current round and start over
        print(f"\n检测到 Ctrl-C,已终止对话 #{job_id},马上重启…")
    except Exception as e:
        # Any other failure: report it and start over
        print(f"\n发生错误:{e}\n已终止对话 #{job_id},马上重启…")
        logger.error(f"Error in job {job_id}: {e}", exc_info=True)
    finally:
        job_id += 1  # interrupted or not, the next round gets a new number
|