[fix](test): fix import kt-kernel (#1728)

Author: ErvinXie
Date: 2025-12-17 19:46:32 +08:00
Committed by: GitHub
Parent: 6fc4080a7d
Commit: a8667ddb58
33 changed files with 1063 additions and 1151 deletions
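The change repeated across the benchmark and example scripts below is how the compiled extension is imported: instead of importing the locally built kt_kernel_ext module directly, the scripts now import it through the kt_kernel package. A minimal sketch of the pattern, assuming kt_kernel re-exports the compiled kt_kernel_ext module; the sys.path handling and the CPUInfer call mirror what the scripts in this diff already do:

    import os, sys

    # The scripts still put the local build directory on sys.path first.
    sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "build"))

    # Old pattern, removed in this commit:
    #   import kt_kernel_ext
    # New pattern: the extension is re-exported by the kt_kernel package.
    from kt_kernel import kt_kernel_ext

    # Typical usage in the benchmarks below (the thread count is an example value).
    CPUInfer = kt_kernel_ext.CPUInfer(64)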

View File

@@ -13,7 +13,7 @@ import os, sys
import time
sys.path.append(os.path.dirname(__file__) + "/../build")
import kt_kernel_ext
from kt_kernel import kt_kernel_ext
import torch
layer_num = 10
@@ -61,11 +61,7 @@ def bench_linear(cache_seqlen: int):
max_thread_num,
)
local_kvcache = kt_kernel_ext.kvcache.KVCache(config)
block_table = (
torch.arange(max_block_num, dtype=torch.int32, device="cpu")
.contiguous()
.view(1, -1)
)
block_table = torch.arange(max_block_num, dtype=torch.int32, device="cpu").contiguous().view(1, -1)
for layer_idx in range(layer_num):
k_cache = torch.randn(
@@ -93,17 +89,11 @@ def bench_linear(cache_seqlen: int):
)
CPUInfer.sync()
input = torch.randn(
(1, 1, q_head_num, head_dim), dtype=torch.float16, device="cpu"
).contiguous()
output = torch.empty(
(1, 1, q_head_num, head_dim), dtype=torch.float16, device="cpu"
).contiguous()
input = torch.randn((1, 1, q_head_num, head_dim), dtype=torch.float16, device="cpu").contiguous()
output = torch.empty((1, 1, q_head_num, head_dim), dtype=torch.float16, device="cpu").contiguous()
# attn_lse: (bsz, q_len, q_head_num)
attn_lse = torch.empty(
(1, 1, q_head_num), dtype=torch.float32, device="cpu"
).contiguous()
attn_lse = torch.empty((1, 1, q_head_num), dtype=torch.float32, device="cpu").contiguous()
input = input / 100
# warm up
@@ -156,16 +146,7 @@ def bench_linear(cache_seqlen: int):
print("Time(us) per iteration: ", total_time / test_iter * 1000000)
print(
"Bandwidth: ",
cache_seqlen
* kv_head_num
* head_dim
* 2
* 2
* test_iter
/ total_time
/ 1000
/ 1000
/ 1000,
cache_seqlen * kv_head_num * head_dim * 2 * 2 * test_iter / total_time / 1000 / 1000 / 1000,
"GB/s",
)
print("")

View File

@@ -13,7 +13,7 @@ import os, sys
import time
sys.path.append(os.path.dirname(__file__) + "/../build")
import kt_kernel_ext
from kt_kernel import kt_kernel_ext
import torch
layer_num = 10
@@ -45,9 +45,7 @@ def bench_linear(cache_seqlen: int, device):
kvcaches.append((k_cache, v_cache))
input = torch.randn(
(1, q_head_num, 1, head_dim), dtype=torch.float16, device=device
).contiguous()
input = torch.randn((1, q_head_num, 1, head_dim), dtype=torch.float16, device=device).contiguous()
input = input / 100
# warm up
@@ -70,16 +68,7 @@ def bench_linear(cache_seqlen: int, device):
print("Time(us) per iteration: ", total_time / test_iter * 1000000)
print(
"Bandwidth: ",
cache_seqlen
* q_head_num
* head_dim
* 2
* 2
* test_iter
/ total_time
/ 1000
/ 1000
/ 1000,
cache_seqlen * q_head_num * head_dim * 2 * 2 * test_iter / total_time / 1000 / 1000 / 1000,
"GB/s",
)
print("")

View File

@@ -15,7 +15,7 @@ from tqdm import tqdm
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "build"))
import kt_kernel_ext
from kt_kernel import kt_kernel_ext
import torch
# Benchmark parameters (single MoE, no layer loop)
@@ -29,9 +29,7 @@ warm_up_iter = 1000
test_iter = 5000
k_group_size = 32
physical_to_logical_map = (
torch.tensor(data=range(expert_num), device="cpu", dtype=torch.int64).contiguous()
)
physical_to_logical_map = torch.tensor(data=range(expert_num), device="cpu", dtype=torch.int64).contiguous()
worker_config = kt_kernel_ext.WorkerPoolConfig()
worker_config.subpool_count = 2
@@ -43,24 +41,12 @@ CPUInfer = kt_kernel_ext.CPUInfer(worker_config)
def get_git_commit():
result = {}
try:
commit = (
subprocess.check_output(["git", "rev-parse", "HEAD"])
.decode("utf-8")
.strip()
)
commit_msg = (
subprocess.check_output(["git", "log", "-1", "--pretty=%B"])
.decode("utf-8")
.strip()
)
commit = subprocess.check_output(["git", "rev-parse", "HEAD"]).decode("utf-8").strip()
commit_msg = subprocess.check_output(["git", "log", "-1", "--pretty=%B"]).decode("utf-8").strip()
result["commit"] = commit
result["commit_message"] = commit_msg
dirty_output = (
subprocess.check_output(["git", "status", "--porcelain"])
.decode("utf-8")
.strip()
)
dirty_output = subprocess.check_output(["git", "status", "--porcelain"]).decode("utf-8").strip()
if dirty_output:
result["dirty"] = True
result["dirty_files"] = dirty_output.splitlines()
@@ -132,9 +118,7 @@ def record_results(result, filename=json_path):
f.write(json.dumps(result) + "\n")
def pack_to_int32(
value: torch.Tensor, num_bits: int, packed_dim: int = 1
) -> torch.Tensor:
def pack_to_int32(value: torch.Tensor, num_bits: int, packed_dim: int = 1) -> torch.Tensor:
if value.dtype is not torch.int8:
raise ValueError("Tensor must be torch.int8 before packing")
if not (1 <= num_bits <= 8):
@@ -181,9 +165,7 @@ def quantize_k2_tensor(weights: torch.Tensor, group_size: int):
weights_f32 = weights.to(torch.float32)
e, rows, cols = weights_f32.shape
if cols % group_size != 0 or cols % 2 != 0:
raise ValueError(
f"cols ({cols}) must be divisible by group_size ({group_size}) and 2"
)
raise ValueError(f"cols ({cols}) must be divisible by group_size ({group_size}) and 2")
reshaped = weights_f32.view(e, rows, cols // group_size, group_size)
max_abs = reshaped.abs().amax(dim=-1, keepdim=True).clamp(min=1e-8)
@@ -191,9 +173,7 @@ def quantize_k2_tensor(weights: torch.Tensor, group_size: int):
q = torch.round(reshaped / scales.unsqueeze(-1)).clamp(-8, 7).to(torch.int8)
q = q.view(e, rows, cols)
packed = pack_tensor_per_row(q, num_bits=4).view(e, rows, cols // 8).contiguous()
scales = scales.to(torch.bfloat16).contiguous().view(
e, rows, cols // group_size
).contiguous()
scales = scales.to(torch.bfloat16).contiguous().view(e, rows, cols // group_size).contiguous()
return packed, scales
@@ -233,9 +213,7 @@ def bench_k2_moe():
bytes_per_elem = 0.5 + 2.0 / k_group_size
quant_data = build_quantized_layer_weights()
config = kt_kernel_ext.moe.MOEConfig(
expert_num, num_experts_per_tok, hidden_size, intermediate_size, 0
)
config = kt_kernel_ext.moe.MOEConfig(expert_num, num_experts_per_tok, hidden_size, intermediate_size, 0)
config.max_len = max_len
config.quant_config.bits = 4
config.quant_config.group_size = k_group_size
@@ -261,12 +239,8 @@ def bench_k2_moe():
.reshape(gen_iter, qlen * num_experts_per_tok)
.contiguous()
)
weights = torch.rand(
(gen_iter, qlen, num_experts_per_tok), dtype=torch.float32, device="cpu"
).contiguous()
input_tensor = torch.randn(
(qlen, hidden_size), dtype=torch.bfloat16, device="cpu"
).contiguous()
weights = torch.rand((gen_iter, qlen, num_experts_per_tok), dtype=torch.float32, device="cpu").contiguous()
input_tensor = torch.randn((qlen, hidden_size), dtype=torch.bfloat16, device="cpu").contiguous()
output_tensor = torch.empty_like(input_tensor)
bsz_tensor = torch.tensor([qlen], device="cpu")
@@ -313,17 +287,7 @@ def bench_k2_moe():
/ total_time
/ 1e9
)
flops = (
hidden_size
* intermediate_size
* qlen
* 3
* num_experts_per_tok
* 2
* test_iter
/ total_time
/ 1e12
)
flops = hidden_size * intermediate_size * qlen * 3 * num_experts_per_tok * 2 * test_iter / total_time / 1e12
print("Quant mode: int4_k2")
print("Time(s): ", total_time)

View File

@@ -14,7 +14,7 @@ from tqdm import tqdm
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "build"))
import kt_kernel_ext
from kt_kernel import kt_kernel_ext
import torch
# Benchmark parameters (single MoE, mirror examples/test_k2_write_buffer.py)
@@ -39,20 +39,12 @@ CPUInfer = kt_kernel_ext.CPUInfer(96)
def get_git_commit():
result = {}
try:
commit = (
subprocess.check_output(["git", "rev-parse", "HEAD"]).decode("utf-8").strip()
)
commit_msg = (
subprocess.check_output(["git", "log", "-1", "--pretty=%B"])
.decode("utf-8")
.strip()
)
commit = subprocess.check_output(["git", "rev-parse", "HEAD"]).decode("utf-8").strip()
commit_msg = subprocess.check_output(["git", "log", "-1", "--pretty=%B"]).decode("utf-8").strip()
result["commit"] = commit
result["commit_message"] = commit_msg
dirty_output = (
subprocess.check_output(["git", "status", "--porcelain"]).decode("utf-8").strip()
)
dirty_output = subprocess.check_output(["git", "status", "--porcelain"]).decode("utf-8").strip()
if dirty_output:
result["dirty"] = True
result["dirty_files"] = dirty_output.splitlines()
@@ -160,9 +152,7 @@ def build_moe():
per_mat_scale_elems,
) = allocate_weights()
config = kt_kernel_ext.moe.MOEConfig(
expert_num, num_experts_per_tok, hidden_size, intermediate_size
)
config = kt_kernel_ext.moe.MOEConfig(expert_num, num_experts_per_tok, hidden_size, intermediate_size)
config.max_len = max_len
config.quant_config.bits = 4
config.quant_config.group_size = group_size
@@ -186,18 +176,10 @@ def build_moe():
total_weight_bytes_per_tp = gpu_experts_num * weight_bytes_per_expert_per_tp
total_scale_elems_per_tp = gpu_experts_num * scale_elems_per_expert_per_tp
w13_weight_bufs = [
torch.empty(2 * total_weight_bytes_per_tp, dtype=torch.uint8) for _ in range(gpu_tp_count)
]
w13_scale_bufs = [
torch.empty(2 * total_scale_elems_per_tp, dtype=torch.bfloat16) for _ in range(gpu_tp_count)
]
w2_weight_bufs = [
torch.empty(total_weight_bytes_per_tp, dtype=torch.uint8) for _ in range(gpu_tp_count)
]
w2_scale_bufs = [
torch.empty(total_scale_elems_per_tp, dtype=torch.bfloat16) for _ in range(gpu_tp_count)
]
w13_weight_bufs = [torch.empty(2 * total_weight_bytes_per_tp, dtype=torch.uint8) for _ in range(gpu_tp_count)]
w13_scale_bufs = [torch.empty(2 * total_scale_elems_per_tp, dtype=torch.bfloat16) for _ in range(gpu_tp_count)]
w2_weight_bufs = [torch.empty(total_weight_bytes_per_tp, dtype=torch.uint8) for _ in range(gpu_tp_count)]
w2_scale_bufs = [torch.empty(total_scale_elems_per_tp, dtype=torch.bfloat16) for _ in range(gpu_tp_count)]
buffer_ptrs = {
"w13_weight_ptrs": [buf.data_ptr() for buf in w13_weight_bufs],
@@ -265,8 +247,6 @@ def bench_write_buffer():
time.sleep(0.6)
print(end - start)
time_per_iter_us = total_time / test_iter * 1e6
bandwidth_gbs = bytes_per_call * test_iter / total_time / 1e9

View File

@@ -1,6 +1,6 @@
#!/usr/bin/env python
# coding=utf-8
'''
"""
Description :
Author : chenht2022
Date : 2024-07-25 10:31:59
@@ -8,11 +8,12 @@ Version : 1.0.0
LastEditors : chenht2022
LastEditTime : 2024-08-06 10:35:35
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
'''
"""
import os, sys
import time
sys.path.append(os.path.dirname(__file__) + '/../build')
import kt_kernel_ext
sys.path.append(os.path.dirname(__file__) + "/../build")
from kt_kernel import kt_kernel_ext
import torch
input_size = 16384
@@ -25,6 +26,7 @@ CPUInfer = kt_kernel_ext.CPUInfer(64)
warm_up_iter = 1000
test_iter = 10000
def bench_linear(quant_mode: str):
with torch.inference_mode(mode=True):
@@ -63,27 +65,25 @@ def bench_linear(quant_mode: str):
proj_type = 16 # ggml_type::GGML_TYPE_IQ2_XXS
bytes_per_elem = 0.257812
else:
assert(False)
assert False
linears = []
projs = []
for _ in range(layer_num):
proj = torch.randn((output_size, input_size), dtype=torch.float32, device = "cuda").to("cpu").contiguous()
config = kt_kernel_ext.linear.LinearConfig(input_size, output_size, stride, group_max_len, proj.data_ptr(), proj_type, hidden_type)
proj = torch.randn((output_size, input_size), dtype=torch.float32, device="cuda").to("cpu").contiguous()
config = kt_kernel_ext.linear.LinearConfig(
input_size, output_size, stride, group_max_len, proj.data_ptr(), proj_type, hidden_type
)
linear = kt_kernel_ext.linear.Linear(config)
projs.append(proj)
linears.append(linear)
input = torch.randn((layer_num, qlen, input_size), dtype=torch.bfloat16, device = "cuda").to("cpu").contiguous()
output = torch.empty((layer_num, qlen, output_size), dtype=torch.bfloat16, device = "cuda").to("cpu").contiguous()
input = torch.randn((layer_num, qlen, input_size), dtype=torch.bfloat16, device="cuda").to("cpu").contiguous()
output = torch.empty((layer_num, qlen, output_size), dtype=torch.bfloat16, device="cuda").to("cpu").contiguous()
# warm up
for i in range(warm_up_iter):
CPUInfer.submit(
linears[i % layer_num].forward(
qlen,
input[i % layer_num].data_ptr(),
output[i % layer_num].data_ptr()
)
linears[i % layer_num].forward(qlen, input[i % layer_num].data_ptr(), output[i % layer_num].data_ptr())
)
CPUInfer.sync()
@@ -91,21 +91,22 @@ def bench_linear(quant_mode: str):
start = time.perf_counter()
for i in range(test_iter):
CPUInfer.submit(
linears[i % layer_num].forward(
qlen,
input[i % layer_num].data_ptr(),
output[i % layer_num].data_ptr()
)
linears[i % layer_num].forward(qlen, input[i % layer_num].data_ptr(), output[i % layer_num].data_ptr())
)
CPUInfer.sync()
end = time.perf_counter()
total_time = end - start
print('Quant mode: ', quant_mode)
print('Time(s): ', total_time)
print('Iteration: ', test_iter)
print('Time(us) per iteration: ', total_time / test_iter * 1000000)
print('Bandwidth: ', input_size * output_size * bytes_per_elem * test_iter / total_time / 1000 / 1000 / 1000, 'GB/s')
print('')
print("Quant mode: ", quant_mode)
print("Time(s): ", total_time)
print("Iteration: ", test_iter)
print("Time(us) per iteration: ", total_time / test_iter * 1000000)
print(
"Bandwidth: ",
input_size * output_size * bytes_per_elem * test_iter / total_time / 1000 / 1000 / 1000,
"GB/s",
)
print("")
bench_linear("fp32")
bench_linear("fp16")

View File

@@ -3,9 +3,10 @@ import time
import subprocess
import platform
import json
os.environ["BLAS_NUM_THREADS"] = "1"
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'build'))
import kt_kernel_ext
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "build"))
from kt_kernel import kt_kernel_ext
from kt_kernel_ext.kvcache import ggml_type
import torch
from torch import inf, nn
@@ -31,7 +32,7 @@ layer_num = 10
rope_theta = 10000
max_qlen = qlen+kvlen
max_qlen = qlen + kvlen
max_kvlen = 4096
max_position_embeddings = 163840
@@ -42,7 +43,7 @@ rope_scaling = {
"mscale": 1.0,
"mscale_all_dim": 1.0,
"original_max_position_embeddings": 4096,
"type": "yarn"
"type": "yarn",
}
CPUINFER_PARAM = 304
@@ -54,13 +55,12 @@ warm_up_iter = 20
test_iter = 100
# 获取脚本相关信息,用于生成结果保存文件名
script_path = os.path.abspath(__file__)
script_dir = os.path.dirname(script_path)
script_name = os.path.splitext(os.path.basename(script_path))[0]
json_path = os.path.join(script_dir, "bench_results "+ ".jsonl")
json_path = os.path.join(script_dir, "bench_results " + ".jsonl")
def get_git_commit():
"""
@@ -100,9 +100,9 @@ def get_system_info():
# 获取 CPU 型号(仅 Linux 支持)
cpu_model = None
if os.path.exists('/proc/cpuinfo'):
if os.path.exists("/proc/cpuinfo"):
try:
with open('/proc/cpuinfo', 'r') as f:
with open("/proc/cpuinfo", "r") as f:
for line in f:
if "model name" in line:
cpu_model = line.split(":", 1)[1].strip()
@@ -113,9 +113,9 @@ def get_system_info():
# 获取内存大小(单位 GB),仅 Linux 支持
mem_total_gb = None
if os.path.exists('/proc/meminfo'):
if os.path.exists("/proc/meminfo"):
try:
with open('/proc/meminfo', 'r') as f:
with open("/proc/meminfo", "r") as f:
for line in f:
if "MemTotal" in line:
mem_kb = float(line.split(":", 1)[1].split()[0])
@@ -149,6 +149,7 @@ def record_results(result, filename=json_path):
with open(filename, "a") as f:
f.write(json.dumps(result) + "\n")
def bench_mla(quant_mode: str):
"""
测试 MLA 模型的性能
@@ -184,9 +185,9 @@ def bench_mla(quant_mode: str):
mlas = []
for i in tqdm(range(layer_num)):
q_a_proj = nn.Linear(hidden_size, q_lora_rank, bias=False, dtype=torch.float16)
q_b_proj = nn.Linear(q_lora_rank, num_heads * (nope_size+rope_size) , bias=False, dtype=torch.float16)
q_b_proj = nn.Linear(q_lora_rank, num_heads * (nope_size + rope_size), bias=False, dtype=torch.float16)
kv_a_proj_with_mqa = nn.Linear(hidden_size, kv_lora_rank + rope_size, bias=False, dtype=torch.float16)
kv_b_proj = nn.Linear( num_heads * (nope_size + nope_size),kv_lora_rank, bias=False, dtype=torch.float16)
kv_b_proj = nn.Linear(num_heads * (nope_size + nope_size), kv_lora_rank, bias=False, dtype=torch.float16)
o_proj = nn.Linear(num_heads * nope_size, hidden_size, bias=False, dtype=torch.float16)
init.normal_(q_a_proj.weight, mean=0.0, std=0.02)
@@ -194,11 +195,11 @@ def bench_mla(quant_mode: str):
init.normal_(kv_a_proj_with_mqa.weight, mean=0.0, std=0.02)
init.normal_(kv_b_proj.weight, mean=0.0, std=0.02)
init.normal_(o_proj.weight, mean=0.0, std=0.02)
q_a_proj_weight = q_a_proj.weight.to(torch.float16).to('cpu').contiguous()
q_b_proj_weight = q_b_proj.weight.to(torch.float16).to('cpu').contiguous()
kv_a_proj_with_mqa_weight = kv_a_proj_with_mqa.weight.to('cpu').to(torch.float16).contiguous()
kv_b_proj_weight = kv_b_proj.weight.to(torch.float16).to('cpu').contiguous()
o_proj_weight = o_proj.weight.to(torch.float16).to('cpu').contiguous()
q_a_proj_weight = q_a_proj.weight.to(torch.float16).to("cpu").contiguous()
q_b_proj_weight = q_b_proj.weight.to(torch.float16).to("cpu").contiguous()
kv_a_proj_with_mqa_weight = kv_a_proj_with_mqa.weight.to("cpu").to(torch.float16).contiguous()
kv_b_proj_weight = kv_b_proj.weight.to(torch.float16).to("cpu").contiguous()
o_proj_weight = o_proj.weight.to(torch.float16).to("cpu").contiguous()
config = kt_kernel_ext.mla.MLAConfig(
hidden_size,
@@ -231,64 +232,85 @@ def bench_mla(quant_mode: str):
config.kv_b_proj_type = ggml_type.FP16
config.w_o_type = ggml_type.FP16
config.pool = CPUInfer.backend_
mla = kt_kernel_ext.mla.MLA(config)
mla.load_weights()
mla.set_local_pages(pages_count)
mlas.append(mla)
print('Generating data...')
input_tensor = torch.randn((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device="cpu").to("cpu").contiguous()
output_tensor = torch.empty((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device="cpu").to("cpu").contiguous()
print("Generating data...")
input_tensor = (
torch.randn((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device="cpu").to("cpu").contiguous()
)
output_tensor = (
torch.empty((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device="cpu").to("cpu").contiguous()
)
print('Warming up...')
print("Warming up...")
for i in tqdm(range(warm_up_iter)):
mlas[i%layer_num].forward([qlen],[page_table],[kvlen],
input_tensor[i%layer_num].data_ptr(),output_tensor[i%layer_num].data_ptr())
mlas[i % layer_num].forward(
[qlen],
[page_table],
[kvlen],
input_tensor[i % layer_num].data_ptr(),
output_tensor[i % layer_num].data_ptr(),
)
print('Start testing...')
print("Start testing...")
start = time.perf_counter()
for i in tqdm(range(test_iter)):
mlas[i%layer_num].forward([qlen],[page_table],[kvlen],
input_tensor[i%layer_num].data_ptr(),output_tensor[i%layer_num].data_ptr())
mlas[i % layer_num].forward(
[qlen],
[page_table],
[kvlen],
input_tensor[i % layer_num].data_ptr(),
output_tensor[i % layer_num].data_ptr(),
)
end = time.perf_counter()
total_time = end - start
time_per_iter_us = (total_time * 1e6) / test_iter
bandwidth = bytes_per_elem * (q_lora_rank * hidden_size
+ (kv_lora_rank+rope_size) * hidden_size
+ (nope_size+rope_size) * q_lora_rank * num_heads
+ (nope_size+nope_size)*kv_lora_rank * num_heads
bandwidth = (
bytes_per_elem
* (
q_lora_rank * hidden_size
+ (kv_lora_rank + rope_size) * hidden_size
+ (nope_size + rope_size) * q_lora_rank * num_heads
+ (nope_size + nope_size) * kv_lora_rank * num_heads
+ hidden_size * nope_size * num_heads
+ hidden_size * qlen) * test_iter / (total_time * 1e9)
flops = 2*(
q_lora_rank*hidden_size*qlen
+ hidden_size * qlen
)
* test_iter
/ (total_time * 1e9)
)
flops = (
2
* (
q_lora_rank * hidden_size * qlen
+ kv_lora_rank * hidden_size * qlen
+num_heads* (nope_size+rope_size)*q_lora_rank*qlen
+ num_heads * (nope_size + rope_size) * q_lora_rank * qlen
+ num_heads * qlen * nope_size * kv_lora_rank
+ num_heads * (kvlen+qlen) * kv_lora_rank * qlen
+ num_heads * rope_size * qlen * (qlen+kvlen)
+ num_heads * (kvlen + qlen) * kv_lora_rank * qlen
+ num_heads * rope_size * qlen * (qlen + kvlen)
+ num_heads * kv_lora_rank * (qlen + kvlen) * qlen
+ num_heads * nope_size * kv_lora_rank * qlen
+ hidden_size * num_heads* nope_size * qlen
) * test_iter / (total_time * 1e12)
+ hidden_size * num_heads * nope_size * qlen
)
* test_iter
/ (total_time * 1e12)
)
print('Quant mode:', quant_mode)
print('Time(s):', total_time)
print('Iteration:', test_iter)
print('Time(us) per iteration:', time_per_iter_us)
print('Bandwidth:', bandwidth, 'GB/s')
print('TFLOPS:', flops)
print('')
print("Quant mode:", quant_mode)
print("Time(s):", total_time)
print("Iteration:", test_iter)
print("Time(us) per iteration:", time_per_iter_us)
print("Bandwidth:", bandwidth, "GB/s")
print("TFLOPS:", flops)
print("")
# 整理测试结果
result = {
@@ -312,21 +334,16 @@ def bench_mla(quant_mode: str):
"q_lora_rank": q_lora_rank,
"nope_size": nope_size,
"rope_size": rope_size,
"layer_num": layer_num,
"rope_theta": rope_theta,
"max_qlen": max_qlen,
"max_kvlen": max_kvlen,
"max_position_embeddings": max_position_embeddings,
"rope_scaling": rope_scaling,
"warm_up_iter": warm_up_iter,
"test_iter": test_iter,
"CPUInfer_parameter": CPUINFER_PARAM
}
"CPUInfer_parameter": CPUINFER_PARAM,
},
}
# 添加 git 与系统信息
result.update(get_git_commit())

View File

@@ -1,6 +1,6 @@
#!/usr/bin/env python
# coding=utf-8
'''
"""
Description :
Author : chenht2022
Date : 2024-07-16 10:43:18
@@ -8,11 +8,12 @@ Version : 1.0.0
LastEditors : chenht2022
LastEditTime : 2024-08-06 10:36:04
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
'''
"""
import os, sys
import time
sys.path.append(os.path.dirname(__file__) + '/../build')
import kt_kernel_ext
sys.path.append(os.path.dirname(__file__) + "/../build")
from kt_kernel import kt_kernel_ext
import torch
hidden_size = 5120
@@ -25,6 +26,7 @@ CPUInfer = kt_kernel_ext.CPUInfer(64)
warm_up_iter = 1000
test_iter = 10000
def bench_mlp(quant_mode: str):
with torch.inference_mode(mode=True):
@@ -85,34 +87,47 @@ def bench_mlp(quant_mode: str):
down_type = 16 # ggml_type::GGML_TYPE_IQ2_XXS
bytes_per_elem = 0.257812
else:
assert(False)
assert False
mlps = []
gate_projs = []
up_projs = []
down_projs = []
for _ in range(layer_num):
gate_proj = torch.randn((intermediate_size, hidden_size), dtype=torch.float32, device = "cuda").to("cpu").contiguous()
up_proj = torch.randn((intermediate_size, hidden_size), dtype=torch.float32, device = "cuda").to("cpu").contiguous()
down_proj = torch.randn((hidden_size, intermediate_size), dtype=torch.float32, device = "cuda").to("cpu").contiguous()
config = kt_kernel_ext.mlp.MLPConfig(hidden_size, intermediate_size, stride, group_max_len, gate_proj.data_ptr(), up_proj.data_ptr(), down_proj.data_ptr(), gate_type, up_type, down_type, hidden_type)
gate_proj = (
torch.randn((intermediate_size, hidden_size), dtype=torch.float32, device="cuda").to("cpu").contiguous()
)
up_proj = (
torch.randn((intermediate_size, hidden_size), dtype=torch.float32, device="cuda").to("cpu").contiguous()
)
down_proj = (
torch.randn((hidden_size, intermediate_size), dtype=torch.float32, device="cuda").to("cpu").contiguous()
)
config = kt_kernel_ext.mlp.MLPConfig(
hidden_size,
intermediate_size,
stride,
group_max_len,
gate_proj.data_ptr(),
up_proj.data_ptr(),
down_proj.data_ptr(),
gate_type,
up_type,
down_type,
hidden_type,
)
mlp = kt_kernel_ext.mlp.MLP(config)
gate_projs.append(gate_proj)
up_projs.append(up_proj)
down_projs.append(down_proj)
mlps.append(mlp)
input = torch.randn((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device = "cuda").to("cpu").contiguous()
output = torch.empty((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device = "cuda").to("cpu").contiguous()
input = torch.randn((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device="cuda").to("cpu").contiguous()
output = torch.empty((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device="cuda").to("cpu").contiguous()
# warm up
for i in range(warm_up_iter):
CPUInfer.submit(
mlps[i % layer_num].forward(
qlen,
input[i % layer_num].data_ptr(),
output[i % layer_num].data_ptr()
)
mlps[i % layer_num].forward(qlen, input[i % layer_num].data_ptr(), output[i % layer_num].data_ptr())
)
CPUInfer.sync()
@@ -120,21 +135,22 @@ def bench_mlp(quant_mode: str):
start = time.perf_counter()
for i in range(test_iter):
CPUInfer.submit(
mlps[i % layer_num].forward(
qlen,
input[i % layer_num].data_ptr(),
output[i % layer_num].data_ptr()
)
mlps[i % layer_num].forward(qlen, input[i % layer_num].data_ptr(), output[i % layer_num].data_ptr())
)
CPUInfer.sync()
end = time.perf_counter()
total_time = end - start
print('Quant mode: ', quant_mode)
print('Time(s): ', total_time)
print('Iteration: ', test_iter)
print('Time(us) per iteration: ', total_time / test_iter * 1000000)
print('Bandwidth: ', hidden_size * intermediate_size * 3 * bytes_per_elem * test_iter / total_time / 1000 / 1000 / 1000, 'GB/s')
print('')
print("Quant mode: ", quant_mode)
print("Time(s): ", total_time)
print("Iteration: ", test_iter)
print("Time(us) per iteration: ", total_time / test_iter * 1000000)
print(
"Bandwidth: ",
hidden_size * intermediate_size * 3 * bytes_per_elem * test_iter / total_time / 1000 / 1000 / 1000,
"GB/s",
)
print("")
bench_mlp("fp32")
bench_mlp("fp16")

View File

@@ -5,8 +5,8 @@ import json
import subprocess
import platform
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'build'))
import kt_kernel_ext
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "build"))
from kt_kernel import kt_kernel_ext
import torch
from tqdm import tqdm
@@ -35,7 +35,7 @@ CPUInfer = kt_kernel_ext.CPUInfer(CPUINFER_PARAM)
script_path = os.path.abspath(__file__)
script_dir = os.path.dirname(script_path)
script_name = os.path.splitext(os.path.basename(script_path))[0]
json_path = os.path.join(script_dir, "bench_results "+ ".jsonl")
json_path = os.path.join(script_dir, "bench_results " + ".jsonl")
def get_git_commit():
@@ -76,9 +76,9 @@ def get_system_info():
# 获取 CPU 型号(仅 Linux 支持)
cpu_model = None
if os.path.exists('/proc/cpuinfo'):
if os.path.exists("/proc/cpuinfo"):
try:
with open('/proc/cpuinfo', 'r') as f:
with open("/proc/cpuinfo", "r") as f:
for line in f:
if "model name" in line:
cpu_model = line.split(":", 1)[1].strip()
@@ -89,9 +89,9 @@ def get_system_info():
# 获取内存大小(单位 GB),仅 Linux 支持
mem_total_gb = None
if os.path.exists('/proc/meminfo'):
if os.path.exists("/proc/meminfo"):
try:
with open('/proc/meminfo', 'r') as f:
with open("/proc/meminfo", "r") as f:
for line in f:
if "MemTotal" in line:
mem_kb = float(line.split(":", 1)[1].split()[0])
@@ -194,9 +194,21 @@ def bench_moe(quant_mode: str):
# 构建各层 MoE 模型
moes = []
for _ in tqdm(range(layer_num), desc="Initializing MOEs"):
gate_proj = torch.randn((expert_num, intermediate_size, hidden_size), dtype=torch.float16, device="cpu").to("cpu").contiguous()
up_proj = torch.randn((expert_num, intermediate_size, hidden_size), dtype=torch.float16, device="cpu").to("cpu").contiguous()
down_proj = torch.randn((expert_num, hidden_size, intermediate_size), dtype=torch.float16, device="cpu").to("cpu").contiguous()
gate_proj = (
torch.randn((expert_num, intermediate_size, hidden_size), dtype=torch.float16, device="cpu")
.to("cpu")
.contiguous()
)
up_proj = (
torch.randn((expert_num, intermediate_size, hidden_size), dtype=torch.float16, device="cpu")
.to("cpu")
.contiguous()
)
down_proj = (
torch.randn((expert_num, hidden_size, intermediate_size), dtype=torch.float16, device="cpu")
.to("cpu")
.contiguous()
)
config = kt_kernel_ext.moe.MOEConfig(expert_num, num_experts_per_tok, hidden_size, intermediate_size)
config.pool = CPUInfer.backend_
@@ -217,10 +229,15 @@ def bench_moe(quant_mode: str):
moes.append(moe)
# 生成输入数据
print('Generating data...')
print("Generating data...")
# 专家路由索引与权重,每层一个
gen_iter = 1000
expert_ids = torch.rand(gen_iter * qlen , expert_num, device="cpu").argsort(dim=-1)[:, :num_experts_per_tok].reshape(gen_iter, qlen * num_experts_per_tok).contiguous()
expert_ids = (
torch.rand(gen_iter * qlen, expert_num, device="cpu")
.argsort(dim=-1)[:, :num_experts_per_tok]
.reshape(gen_iter, qlen * num_experts_per_tok)
.contiguous()
)
weights = torch.rand((gen_iter, qlen, num_experts_per_tok), dtype=torch.float32, device="cpu").contiguous()
input_tensor = torch.randn((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device="cpu").contiguous()
output_tensor = torch.empty((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device="cpu").contiguous()
@@ -228,34 +245,34 @@ def bench_moe(quant_mode: str):
qlen_tensor = torch.tensor([qlen], dtype=torch.int32)
# 预热阶段
print('Warming up...')
print("Warming up...")
for i in tqdm(range(warm_up_iter), desc="Warm-up"):
CPUInfer.submit(
moes[i % layer_num].forward_task(
qlen_tensor.data_ptr(),
num_experts_per_tok,
expert_ids[i%gen_iter].data_ptr(),
weights[i%gen_iter].data_ptr(),
expert_ids[i % gen_iter].data_ptr(),
weights[i % gen_iter].data_ptr(),
input_tensor[i % layer_num].data_ptr(),
output_tensor[i % layer_num].data_ptr(),
False
False,
)
)
CPUInfer.sync()
# 测试阶段
print('Start testing...')
print("Start testing...")
start = time.perf_counter()
for i in tqdm(range(test_iter), desc="Testing"):
CPUInfer.submit(
moes[i % layer_num].forward_task(
qlen_tensor.data_ptr(),
num_experts_per_tok,
expert_ids[i%gen_iter].data_ptr(),
weights[i%gen_iter].data_ptr(),
expert_ids[i % gen_iter].data_ptr(),
weights[i % gen_iter].data_ptr(),
input_tensor[i % layer_num].data_ptr(),
output_tensor[i % layer_num].data_ptr(),
False
False,
)
)
CPUInfer.sync()
@@ -264,17 +281,29 @@ def bench_moe(quant_mode: str):
# 计算性能指标
time_per_iter_us = total_time / test_iter * 1e6
bandwidth = hidden_size * intermediate_size * 3 * num_experts_per_tok * (1/8 * 256 * (1-(31/32)**qlen)) * bytes_per_elem * test_iter / total_time / 1e9 # 单位GB/s
flops = hidden_size * intermediate_size * qlen * 3 * num_experts_per_tok * 2 * test_iter / total_time / 1e12 # 单位TFLOPS
bandwidth = (
hidden_size
* intermediate_size
* 3
* num_experts_per_tok
* (1 / 8 * 256 * (1 - (31 / 32) ** qlen))
* bytes_per_elem
* test_iter
/ total_time
/ 1e9
) # 单位GB/s
flops = (
hidden_size * intermediate_size * qlen * 3 * num_experts_per_tok * 2 * test_iter / total_time / 1e12
) # 单位TFLOPS
# 打印结果
print('Quant mode:', quant_mode)
print('Time(s):', total_time)
print('Iteration:', test_iter)
print('Time(us) per iteration:', time_per_iter_us)
print('Bandwidth:', bandwidth, 'GB/s')
print('TFLOPS:', flops)
print('')
print("Quant mode:", quant_mode)
print("Time(s):", total_time)
print("Iteration:", test_iter)
print("Time(us) per iteration:", time_per_iter_us)
print("Bandwidth:", bandwidth, "GB/s")
print("TFLOPS:", flops)
print("")
# 整理测试结果
result = {
@@ -298,8 +327,8 @@ def bench_moe(quant_mode: str):
"qlen": qlen,
"warm_up_iter": warm_up_iter,
"test_iter": test_iter,
"CPUInfer_parameter": CPUINFER_PARAM
}
"CPUInfer_parameter": CPUINFER_PARAM,
},
}
# 添加 git 与系统信息
result.update(get_git_commit())

View File

@@ -15,7 +15,7 @@ from tqdm import tqdm
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "build"))
import torch
import kt_kernel_ext
from kt_kernel import kt_kernel_ext
import numpy as np
# 测试参数设置

View File

@@ -1,6 +1,6 @@
#!/usr/bin/env python
# coding=utf-8
'''
"""
Description :
Author : chenht2022
Date : 2024-07-25 10:32:05
@@ -8,12 +8,13 @@ Version : 1.0.0
LastEditors : chenht2022
LastEditTime : 2024-08-06 10:41:28
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
'''
"""
import os, sys, time, json, subprocess, platform
from tqdm import tqdm
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'build'))
import kt_kernel_ext
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "build"))
from kt_kernel import kt_kernel_ext
import torch
import numpy as np
@@ -30,24 +31,19 @@ warm_up_iter = 1000
test_iter = 5000
k_group_size = 128
physical_to_logical_map = torch.tensor(
data=range(expert_num),
device="cpu",
dtype=torch.int64
).contiguous()
physical_to_logical_map = torch.tensor(data=range(expert_num), device="cpu", dtype=torch.int64).contiguous()
# 将 CPUInfer 参数设为变量
# CPUINFER_PARAM = 257
# CPUInfer = kt_kernel_ext.CPUInfer(CPUINFER_PARAM)
worker_config = kt_kernel_ext.WorkerPoolConfig()
worker_config.subpool_count = 2
worker_config.subpool_numa_map= [0,1]
worker_config.subpool_thread_count = [40,40]
worker_config.subpool_numa_map = [0, 1]
worker_config.subpool_thread_count = [40, 40]
CPUINFER_PARAM = 80
CPUInfer = kt_kernel_ext.CPUInfer(worker_config)
def get_git_commit():
"""
获取当前 git 提交记录(commit hash 和提交信息),
@@ -87,9 +83,9 @@ def get_system_info():
# 获取 CPU 型号(仅 Linux 支持)
cpu_model = None
if os.path.exists('/proc/cpuinfo'):
if os.path.exists("/proc/cpuinfo"):
try:
with open('/proc/cpuinfo', 'r') as f:
with open("/proc/cpuinfo", "r") as f:
for line in f:
if "model name" in line:
cpu_model = line.split(":", 1)[1].strip()
@@ -100,9 +96,9 @@ def get_system_info():
# 获取内存大小(单位 GB),仅 Linux 支持
mem_total_gb = None
if os.path.exists('/proc/meminfo'):
if os.path.exists("/proc/meminfo"):
try:
with open('/proc/meminfo', 'r') as f:
with open("/proc/meminfo", "r") as f:
for line in f:
if "MemTotal" in line:
mem_kb = float(line.split(":", 1)[1].split()[0])
@@ -130,11 +126,13 @@ def get_system_info():
return info
script_path = os.path.abspath(__file__)
script_dir = os.path.dirname(script_path)
script_name = os.path.splitext(os.path.basename(script_path))[0]
json_path = os.path.join(script_dir, script_name + ".jsonl")
def record_results(result, filename=json_path):
"""
将结果以 JSON 格式追加到文件中
@@ -142,6 +140,7 @@ def record_results(result, filename=json_path):
with open(filename, "a") as f:
f.write(json.dumps(result) + "\n")
def bench_moe(quant_mode: str):
with torch.inference_mode():
if quant_mode == "bf16":
@@ -160,11 +159,22 @@ def bench_moe(quant_mode: str):
up_projs = []
down_projs = []
for layer_index in range(layer_num):
gate_proj = torch.randn((expert_num, intermediate_size, hidden_size), dtype=torch.float32, device="cuda").to("cpu").contiguous()
up_proj = torch.randn((expert_num, intermediate_size, hidden_size), dtype=torch.float32, device="cuda").to("cpu").contiguous()
down_proj = torch.randn((expert_num, hidden_size, intermediate_size), dtype=torch.float32, device="cuda").to("cpu").contiguous()
config = kt_kernel_ext.moe.MOEConfig(
expert_num, num_experts_per_tok, hidden_size, intermediate_size,0)
gate_proj = (
torch.randn((expert_num, intermediate_size, hidden_size), dtype=torch.float32, device="cuda")
.to("cpu")
.contiguous()
)
up_proj = (
torch.randn((expert_num, intermediate_size, hidden_size), dtype=torch.float32, device="cuda")
.to("cpu")
.contiguous()
)
down_proj = (
torch.randn((expert_num, hidden_size, intermediate_size), dtype=torch.float32, device="cuda")
.to("cpu")
.contiguous()
)
config = kt_kernel_ext.moe.MOEConfig(expert_num, num_experts_per_tok, hidden_size, intermediate_size, 0)
config.max_len = max_len
config.gate_proj = gate_proj.data_ptr()
config.up_proj = up_proj.data_ptr()
@@ -189,10 +199,22 @@ def bench_moe(quant_mode: str):
down_projs.append(down_proj)
moes.append(moe)
gen_iter = 3000
expert_ids = torch.rand(gen_iter * qlen , expert_num, device="cpu").argsort(dim=-1)[:, :num_experts_per_tok].reshape(gen_iter, qlen * num_experts_per_tok).to("cpu").contiguous()
weights = torch.rand((gen_iter, qlen, num_experts_per_tok), dtype=torch.float32, device="cpu").to("cpu").contiguous()
input_tensor = torch.randn((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device="cuda").to("cpu").contiguous()
output_tensor = torch.empty((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device="cuda").to("cpu").contiguous()
expert_ids = (
torch.rand(gen_iter * qlen, expert_num, device="cpu")
.argsort(dim=-1)[:, :num_experts_per_tok]
.reshape(gen_iter, qlen * num_experts_per_tok)
.to("cpu")
.contiguous()
)
weights = (
torch.rand((gen_iter, qlen, num_experts_per_tok), dtype=torch.float32, device="cpu").to("cpu").contiguous()
)
input_tensor = (
torch.randn((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device="cuda").to("cpu").contiguous()
)
output_tensor = (
torch.empty((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device="cuda").to("cpu").contiguous()
)
bsz_tensor = torch.tensor([qlen], device="cpu")
# 预热迭代
@@ -203,8 +225,8 @@ def bench_moe(quant_mode: str):
moes[i % layer_num].forward_task(
bsz_tensor.data_ptr(),
num_experts_per_tok,
expert_ids[i%gen_iter].data_ptr(),
weights[i%gen_iter].data_ptr(),
expert_ids[i % gen_iter].data_ptr(),
weights[i % gen_iter].data_ptr(),
input_tensor[i % layer_num].data_ptr(),
output_tensor[i % layer_num].data_ptr(),
False,
@@ -224,8 +246,8 @@ def bench_moe(quant_mode: str):
moes[i % layer_num].forward_task(
bsz_tensor.data_ptr(),
num_experts_per_tok,
expert_ids[i%gen_iter].data_ptr(),
weights[i%gen_iter].data_ptr(),
expert_ids[i % gen_iter].data_ptr(),
weights[i % gen_iter].data_ptr(),
input_tensor[i % layer_num].data_ptr(),
output_tensor[i % layer_num].data_ptr(),
False,
@@ -239,16 +261,28 @@ def bench_moe(quant_mode: str):
# 计算性能指标
time_per_iter_us = total_time / test_iter * 1e6
bandwidth = hidden_size * intermediate_size * 3 * num_experts_per_tok * (1/8 * 256 * (1-(31/32)**qlen)) * bytes_per_elem * test_iter / total_time / 1e9 # 单位GB/s
flops = hidden_size * intermediate_size * qlen * 3 * num_experts_per_tok * 2 * test_iter / total_time / 1e12 # 单位TFLOPS
bandwidth = (
hidden_size
* intermediate_size
* 3
* num_experts_per_tok
* (1 / 8 * 256 * (1 - (31 / 32) ** qlen))
* bytes_per_elem
* test_iter
/ total_time
/ 1e9
) # 单位GB/s
flops = (
hidden_size * intermediate_size * qlen * 3 * num_experts_per_tok * 2 * test_iter / total_time / 1e12
) # 单位TFLOPS
print('Quant mode: ', quant_mode)
print('Time(s): ', total_time)
print('Iteration: ', test_iter)
print('Time(us) per iteration: ', time_per_iter_us)
print('Bandwidth: ', bandwidth, 'GB/s')
print('Flops: ', flops, 'TFLOPS')
print('')
print("Quant mode: ", quant_mode)
print("Time(s): ", total_time)
print("Iteration: ", test_iter)
print("Time(us) per iteration: ", time_per_iter_us)
print("Bandwidth: ", bandwidth, "GB/s")
print("Flops: ", flops, "TFLOPS")
print("")
# 整理结果记录,包括测试参数
result = {
@@ -270,8 +304,8 @@ def bench_moe(quant_mode: str):
"warm_up_iter": warm_up_iter,
"test_iter": test_iter,
"CPUInfer_parameter": CPUINFER_PARAM,
"k_group_size": k_group_size
}
"k_group_size": k_group_size,
},
}
# 添加 git 提交记录信息
result.update(get_git_commit())
@@ -280,6 +314,7 @@ def bench_moe(quant_mode: str):
# 将结果以 JSON 形式追加到文件中
record_results(result)
if __name__ == "__main__":
# 选择需要测试的量化模式
# bench_moe("bf16")

View File

@@ -14,7 +14,7 @@ import os, sys, time, json, subprocess, platform
os.environ["BLAS_NUM_THREADS"] = "1"
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "build"))
import torch
import kt_kernel_ext
from kt_kernel import kt_kernel_ext
import numpy as np
from tqdm import tqdm

View File

@@ -26,7 +26,7 @@ os.environ.setdefault("BLAS_NUM_THREADS", "1")
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "build"))
import torch # noqa: E402
import kt_kernel_ext as ce # noqa: E402
from kt_kernel import kt_kernel_ext as ce # noqa: E402
from tqdm import tqdm # noqa: E402

View File

@@ -13,7 +13,7 @@ import os, sys, time, json, subprocess, platform
os.environ["BLAS_NUM_THREADS"] = "1"
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "build"))
import kt_kernel_ext
from kt_kernel import kt_kernel_ext
import torch
import numpy as np
from tqdm import tqdm

View File

@@ -1,9 +1,10 @@
import os
import sys
sys.path.insert(0, os.path.dirname(__file__) + '/../build')
sys.path.insert(0, os.path.dirname(__file__) + "/../build")
import torch
import ctypes
import kt_kernel_ext
from kt_kernel import kt_kernel_ext
from kt_kernel_ext.moe import MOEConfig, MOE, AMXBF16_MOE, AMXInt8_MOE, AMXInt4_MOE, AMXInt4_1_MOE
intermediate_size_full = 2048
@@ -19,15 +20,9 @@ gate = torch.empty(experts_num, intermediate_size_full, hidden_size, dtype=torch
down = torch.empty(experts_num, hidden_size, intermediate_size_full, dtype=torch.bfloat16, device="cpu")
gate_ptr = ctypes.addressof(
ctypes.cast(gate.data_ptr(), ctypes.POINTER(ctypes.c_uint64)).contents
)
up_ptr = ctypes.addressof(
ctypes.cast(up.data_ptr(), ctypes.POINTER(ctypes.c_uint64)).contents
)
down_ptr = ctypes.addressof(
ctypes.cast(down.data_ptr(), ctypes.POINTER(ctypes.c_uint64)).contents
)
gate_ptr = ctypes.addressof(ctypes.cast(gate.data_ptr(), ctypes.POINTER(ctypes.c_uint64)).contents)
up_ptr = ctypes.addressof(ctypes.cast(up.data_ptr(), ctypes.POINTER(ctypes.c_uint64)).contents)
down_ptr = ctypes.addressof(ctypes.cast(down.data_ptr(), ctypes.POINTER(ctypes.c_uint64)).contents)
moe_config = MOEConfig(
experts_num,
num_experts_per_tok,
@@ -36,7 +31,7 @@ moe_config = MOEConfig(
)
moe_config.layer_idx = 45
moe_config.pool = cpu_infer.backend_
moe_config.max_len = 1024 #TODO(zbx): multi cuda graph
moe_config.max_len = 1024 # TODO(zbx): multi cuda graph
moe_config.gate_proj = gate_ptr
moe_config.up_proj = up_ptr
moe_config.down_proj = down_ptr

View File

@@ -13,7 +13,7 @@ import os, sys
import time
sys.path.append(os.path.dirname(__file__) + "/../build")
import kt_kernel_ext
from kt_kernel import kt_kernel_ext
from flash_attn import flash_attn_with_kvcache
import torch
@@ -59,19 +59,11 @@ with torch.inference_mode(mode=True):
local_kvcache = kt_kernel_ext.kvcache.KVCache(config)
kvcaches = []
block_table = (
torch.arange(max_block_num, dtype=torch.int32, device="cpu")
.contiguous()
.view(1, -1)
)
block_table = torch.arange(max_block_num, dtype=torch.int32, device="cpu").contiguous().view(1, -1)
for layer_idx in range(layer_num):
k_cache = torch.randn(
(1, cache_seqlen, kv_head_num, head_dim), dtype=torch.float16, device="cpu"
).contiguous()
v_cache = torch.randn(
(1, cache_seqlen, kv_head_num, head_dim), dtype=torch.float16, device="cpu"
).contiguous()
k_cache = torch.randn((1, cache_seqlen, kv_head_num, head_dim), dtype=torch.float16, device="cpu").contiguous()
v_cache = torch.randn((1, cache_seqlen, kv_head_num, head_dim), dtype=torch.float16, device="cpu").contiguous()
CPUInfer.submit(
local_kvcache.update_kvcache_fp16(
@@ -94,17 +86,11 @@ with torch.inference_mode(mode=True):
k_cache = kvcaches[i % layer_num][0]
v_cache = kvcaches[i % layer_num][1]
input = torch.randn(
(1, 1, q_head_num, head_dim), dtype=torch.float16, device="cpu"
).contiguous()
output = torch.empty(
(1, 1, q_head_num, head_dim), dtype=torch.float16, device="cpu"
).contiguous()
input = torch.randn((1, 1, q_head_num, head_dim), dtype=torch.float16, device="cpu").contiguous()
output = torch.empty((1, 1, q_head_num, head_dim), dtype=torch.float16, device="cpu").contiguous()
# attn_lse: (bsz, q_len, q_head_num)
attn_lse = torch.empty(
(1, 1, q_head_num), dtype=torch.float32, device="cpu"
).contiguous()
attn_lse = torch.empty((1, 1, q_head_num), dtype=torch.float32, device="cpu").contiguous()
input = input / 100
CPUInfer.submit(
@@ -135,8 +121,6 @@ with torch.inference_mode(mode=True):
)
# print("torch output", t_output)
diff = torch.mean(torch.abs(output.to("cuda") - t_output)) / torch.mean(
torch.abs(t_output)
)
diff = torch.mean(torch.abs(output.to("cuda") - t_output)) / torch.mean(torch.abs(t_output))
print("diff = ", diff)
assert diff < 0.001

View File

@@ -2,7 +2,7 @@ import os, sys
sys.path.insert(0, os.path.dirname(__file__) + "/../build")
import kt_kernel_ext
from kt_kernel import kt_kernel_ext
import torch
# Set fixed seed for reproducible results

View File

@@ -1,8 +1,9 @@
import os, sys
import time
os.environ["BLAS_NUM_THREADS"] = "1"
sys.path.insert(0, os.path.dirname(__file__) + "/../build")
import kt_kernel_ext
from kt_kernel import kt_kernel_ext
from kt_kernel_ext.kvcache import ggml_type
import torch
import logging
@@ -20,6 +21,7 @@ from transformers import (
logger = logging.getLogger("reader")
from gguf.gguf_reader import GGUFReader
# load_layers = 6
load_layers = None
CPUInfer = kt_kernel_ext.CPUInfer(304)
@@ -284,15 +286,15 @@ def build_moegate(layer_idx, json_config, gguf_weights):
json_config["topk_group"],
)
config.routed_scaling_factor = json_config['routed_scaling_factor']
config.routed_scaling_factor = json_config["routed_scaling_factor"]
config.pool = CPUInfer.backend_
weight,weight_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.ffn_gate_inp.weight")
weight, weight_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.ffn_gate_inp.weight")
config.weight = weight.data_ptr()
config.weight_type = type_to_ggml_type(weight_type)
bias,bias_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.exp_probs_b.bias")
bias, bias_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.exp_probs_b.bias")
config.e_score_correction_bias = bias.data_ptr()
config.e_score_correction_bias_type = type_to_ggml_type(bias_type)
@@ -301,7 +303,6 @@ def build_moegate(layer_idx, json_config, gguf_weights):
return gate
def build_llm(json_config, gguf_weights):
general_config = kt_kernel_ext.GeneralConfig()
@@ -312,7 +313,7 @@ def build_llm(json_config, gguf_weights):
general_config.n_shared_experts = json_config["n_shared_experts"]
general_config.max_qlen = max_qlen
lm_heads,lm_heads_type = get_torch_tensor_and_type_from_gguf(gguf_weights, "output.weight")
lm_heads, lm_heads_type = get_torch_tensor_and_type_from_gguf(gguf_weights, "output.weight")
general_config.lm_heads_ptr = lm_heads.data_ptr()
general_config.lm_heads_type = type_to_ggml_type(lm_heads_type)
@@ -320,7 +321,7 @@ def build_llm(json_config, gguf_weights):
general_config.norm_weights_ptr = output_norm.data_ptr()
general_config.norm_weights_type = type_to_ggml_type(output_norm_type)
token_embd,token_embd_type = get_torch_tensor_and_type_from_gguf(weights, "token_embd.weight")
token_embd, token_embd_type = get_torch_tensor_and_type_from_gguf(weights, "token_embd.weight")
general_config.token_embd_ptr = token_embd.data_ptr()
general_config.token_embd_type = type_to_ggml_type(token_embd_type)
@@ -330,12 +331,11 @@ def build_llm(json_config, gguf_weights):
model = kt_kernel_ext.DeepseekV3Model(general_config)
llm.model = model
decoder_layers = []
real_load_layers = json_config["num_hidden_layers"] if load_layers is None else load_layers
for i in range(real_load_layers):
layer = kt_kernel_ext.DeepseekV3DecoderLayer(general_config,i)
layer = kt_kernel_ext.DeepseekV3DecoderLayer(general_config, i)
attn_norm, attn_norm_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{i}.attn_norm.weight")
ffn_norm, ffn_norm_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{i}.ffn_norm.weight")
@@ -355,7 +355,7 @@ def build_llm(json_config, gguf_weights):
return llm
safetensor_path = '/home/bd/models/DeepSeek-R1'
safetensor_path = "/home/bd/models/DeepSeek-R1"
json_path = os.path.join(safetensor_path, "config.json")
json_config = json.load(open(json_path, "r"))
print(json_config)
@@ -389,7 +389,7 @@ config = AutoConfig.from_pretrained(safetensor_path, trust_remote_code=True)
force_think = False
output_logits = torch.zeros((max_qlen, json_config['vocab_size']), dtype=torch.float32)
output_logits = torch.zeros((max_qlen, json_config["vocab_size"]), dtype=torch.float32)
def start_chat(content=None):
@@ -397,14 +397,12 @@ def start_chat(content=None):
content = input("Chat: ")
messages = [{"role": "user", "content": content}]
input_tensor = tokenizer.apply_chat_template(
messages, add_generation_prompt=True, return_tensors="pt"
)
input_tensor = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt")
if force_think:
token_thinks = torch.tensor([tokenizer.encode("<think>\\n",add_special_tokens=False)],device=input_tensor.device)
input_tensor = torch.cat(
[input_tensor, token_thinks], dim=1
token_thinks = torch.tensor(
[tokenizer.encode("<think>\\n", add_special_tokens=False)], device=input_tensor.device
)
input_tensor = torch.cat([input_tensor, token_thinks], dim=1)
input_tensor = input_tensor.squeeze(0) # Add batch dimension
print(f"Input tensor: {input_tensor}, type {input_tensor.dtype}, shape {input_tensor.shape}")
@@ -415,13 +413,15 @@ def start_chat(content=None):
stream = TextStreamer(tokenizer)
qlen = input_tensor.shape[0]
qlens = [qlen-kvlen]
qlens = [qlen - kvlen]
kvlens = [kvlen]
page_tables = [list(range(pages_count))]
start_time = time.perf_counter()
llm.forward(qlens,page_tables, kvlens, input_tensor[kvlen:].data_ptr(), output_logits.data_ptr())
llm.forward(qlens, page_tables, kvlens, input_tensor[kvlen:].data_ptr(), output_logits.data_ptr())
end_time = time.perf_counter()
print(f"Forward time: {end_time - start_time:.4f} seconds, tps: {qlens[0] / (end_time - start_time)} tokens/sec")
print(
f"Forward time: {end_time - start_time:.4f} seconds, tps: {qlens[0] / (end_time - start_time)} tokens/sec"
)
logits = output_logits[0]
# print(logits)
@@ -431,18 +431,18 @@ def start_chat(content=None):
kvlen = input_tensor.shape[0]
input_tensor = torch.cat((input_tensor, torch.tensor([next_token])), dim=-1)
if next_token == tokenizer.eos_token_id or tokenizer.decode(next_token) == '<|im_end|>':
if next_token == tokenizer.eos_token_id or tokenizer.decode(next_token) == "<|im_end|>":
stream.end()
break
else:
stream.put(torch.tensor([next_token]))
job_id = 0
while True:
try:
# ---------- 让用户决定是否继续 ----------
choice = input(
"\n【回车】开始对话 | 输入 1 读取文件 | 输入 q/quit/exit 退出程序: "
).strip().lower()
choice = input("\n【回车】开始对话 | 输入 1 读取文件 | 输入 q/quit/exit 退出程序: ").strip().lower()
if choice in {"q", "quit", "exit"}:
print("收到退出指令,程序结束。")
break
@@ -467,14 +467,3 @@ while True:
logger.error(f"Error in job {job_id}: {e}", exc_info=True)
finally:
job_id += 1 # 不管中断与否,都给下一任务换编号

View File

@@ -1,8 +1,9 @@
import os, sys
import time
os.environ["BLAS_NUM_THREADS"] = "1"
sys.path.insert(0, os.path.dirname(__file__) + "/../build")
import kt_kernel_ext
from kt_kernel import kt_kernel_ext
from kt_kernel_ext.kvcache import ggml_type
import torch
import logging
@@ -189,7 +190,6 @@ def build_mla(layer_idx, json_config, gguf_weights):
config.pool = CPUInfer.backend_
config.page_count = pages_count
if q_a_type == "F32":
mla = kt_kernel_ext.mla.MLA_F32(config)
elif q_a_type == "F16":
@@ -284,15 +284,15 @@ def build_moegate(layer_idx, json_config, gguf_weights):
json_config["topk_group"],
)
config.routed_scaling_factor = json_config['routed_scaling_factor']
config.routed_scaling_factor = json_config["routed_scaling_factor"]
config.pool = CPUInfer.backend_
weight,weight_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.ffn_gate_inp.weight")
weight, weight_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.ffn_gate_inp.weight")
config.weight = weight.data_ptr()
config.weight_type = type_to_ggml_type(weight_type)
bias,bias_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.exp_probs_b.bias")
bias, bias_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.exp_probs_b.bias")
config.e_score_correction_bias = bias.data_ptr()
config.e_score_correction_bias_type = type_to_ggml_type(bias_type)
@@ -301,7 +301,6 @@ def build_moegate(layer_idx, json_config, gguf_weights):
return gate
def build_llm(json_config, gguf_weights):
general_config = kt_kernel_ext.GeneralConfig()
@@ -312,7 +311,7 @@ def build_llm(json_config, gguf_weights):
general_config.n_shared_experts = json_config["n_shared_experts"]
general_config.max_qlen = max_qlen
lm_heads,lm_heads_type = get_torch_tensor_and_type_from_gguf(gguf_weights, "output.weight")
lm_heads, lm_heads_type = get_torch_tensor_and_type_from_gguf(gguf_weights, "output.weight")
general_config.lm_heads_ptr = lm_heads.data_ptr()
general_config.lm_heads_type = type_to_ggml_type(lm_heads_type)
@@ -320,7 +319,7 @@ def build_llm(json_config, gguf_weights):
general_config.norm_weights_ptr = output_norm.data_ptr()
general_config.norm_weights_type = type_to_ggml_type(output_norm_type)
token_embd,token_embd_type = get_torch_tensor_and_type_from_gguf(weights, "token_embd.weight")
token_embd, token_embd_type = get_torch_tensor_and_type_from_gguf(weights, "token_embd.weight")
general_config.token_embd_ptr = token_embd.data_ptr()
general_config.token_embd_type = type_to_ggml_type(token_embd_type)
@@ -330,12 +329,11 @@ def build_llm(json_config, gguf_weights):
model = kt_kernel_ext.DeepseekV3Model(general_config)
llm.model = model
decoder_layers = []
for i in range(json_config["num_hidden_layers"]):
# for i in range(6):
# for i in [0,1,2,3,4,5,6,7,8,9,10]:
layer = kt_kernel_ext.DeepseekV3DecoderLayer(general_config,i)
layer = kt_kernel_ext.DeepseekV3DecoderLayer(general_config, i)
attn_norm, attn_norm_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{i}.attn_norm.weight")
ffn_norm, ffn_norm_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{i}.ffn_norm.weight")
@@ -355,7 +353,7 @@ def build_llm(json_config, gguf_weights):
return llm
safetensor_path = '/home/bd/models/DeepSeek-R1'
safetensor_path = "/home/bd/models/DeepSeek-R1"
json_path = os.path.join(safetensor_path, "config.json")
json_config = json.load(open(json_path, "r"))
print(json_config)
@@ -384,7 +382,7 @@ prompt_file = None
force_think = False
output_logits = torch.zeros((max_qlen, json_config['vocab_size']), dtype=torch.float32)
output_logits = torch.zeros((max_qlen, json_config["vocab_size"]), dtype=torch.float32)
def start_chat():
@@ -413,14 +411,12 @@ def start_chat():
content = open(content, "r").read()
messages = [{"role": "user", "content": content}]
input_tensor = tokenizer.apply_chat_template(
messages, add_generation_prompt=True, return_tensors="pt"
)
input_tensor = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt")
if force_think:
token_thinks = torch.tensor([tokenizer.encode("<think>\\n",add_special_tokens=False)],device=input_tensor.device)
input_tensor = torch.cat(
[input_tensor, token_thinks], dim=1
token_thinks = torch.tensor(
[tokenizer.encode("<think>\\n", add_special_tokens=False)], device=input_tensor.device
)
input_tensor = torch.cat([input_tensor, token_thinks], dim=1)
input_tensor = input_tensor.squeeze(0) # Add batch dimension
print(f"Input tensor: {input_tensor}, type {input_tensor.dtype}, shape {input_tensor.shape}")
@@ -431,7 +427,7 @@ def start_chat():
qlens = [qlen]
kvlens = [0]
page_tables = [list(range(pages_count))]
llm.forward(qlens,page_tables, kvlens, input_tensor.data_ptr(), output_logits.data_ptr())
llm.forward(qlens, page_tables, kvlens, input_tensor.data_ptr(), output_logits.data_ptr())
logits = output_logits[0]
# print(logits)
@@ -440,19 +436,18 @@ def start_chat():
# print(f"Next token: {next_token}, {tokenizer.decode(next_token)}")
input_tensor = torch.cat((input_tensor, torch.tensor([next_token])), dim=-1)
if next_token == tokenizer.eos_token_id or tokenizer.decode(next_token) == '<|im_end|>':
if next_token == tokenizer.eos_token_id or tokenizer.decode(next_token) == "<|im_end|>":
print(stream.end(), end="", flush=True)
break
else:
print(stream.put(torch.tensor([next_token])), end="", flush=True)
job_id = 0
while True:
try:
# ---------- 让用户决定是否继续 ----------
choice = input(
"\n【回车】开始对话 | 输入 q/quit/exit 退出程序: "
).strip().lower()
choice = input("\n【回车】开始对话 | 输入 q/quit/exit 退出程序: ").strip().lower()
if choice in {"q", "quit", "exit"}:
print("收到退出指令,程序结束。")
break
@@ -465,14 +460,3 @@ while True:
print(f"\n检测到 Ctrl-C已终止对话 #{job_id},马上重启…")
finally:
job_id += 1 # 不管中断与否,都给下一任务换编号

View File

@@ -1,8 +1,9 @@
import os, sys
import time
os.environ["BLAS_NUM_THREADS"] = "1"
sys.path.insert(0, os.path.dirname(__file__) + "/../build")
import kt_kernel_ext
from kt_kernel import kt_kernel_ext
from kt_kernel_ext.kvcache import ggml_type
import torch
import logging
@@ -20,12 +21,13 @@ from transformers import (
logger = logging.getLogger("reader")
from gguf.gguf_reader import GGUFReader
# load_layers = 3
load_layers = None
worker_config = kt_kernel_ext.WorkerPoolConfig()
worker_config.subpool_count = 2
worker_config.subpool_numa_map= [0,1]
worker_config.subpool_thread_count = [72,72]
worker_config.subpool_numa_map = [0, 1]
worker_config.subpool_thread_count = [72, 72]
CPUInfer = kt_kernel_ext.CPUInfer(worker_config)
max_qlen = 4096
@@ -289,15 +291,15 @@ def build_moegate(layer_idx, json_config, gguf_weights):
json_config["topk_group"],
)
config.routed_scaling_factor = json_config['routed_scaling_factor']
config.routed_scaling_factor = json_config["routed_scaling_factor"]
config.pool = CPUInfer.backend_
weight,weight_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.ffn_gate_inp.weight")
weight, weight_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.ffn_gate_inp.weight")
config.weight = weight.data_ptr()
config.weight_type = type_to_ggml_type(weight_type)
bias,bias_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.exp_probs_b.bias")
bias, bias_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.exp_probs_b.bias")
config.e_score_correction_bias = bias.data_ptr()
config.e_score_correction_bias_type = type_to_ggml_type(bias_type)
@@ -306,7 +308,6 @@ def build_moegate(layer_idx, json_config, gguf_weights):
return gate
def build_llm(json_config, gguf_weights):
general_config = kt_kernel_ext.GeneralConfig()
@@ -317,7 +318,7 @@ def build_llm(json_config, gguf_weights):
general_config.n_shared_experts = json_config["n_shared_experts"]
general_config.max_qlen = max_qlen
lm_heads,lm_heads_type = get_torch_tensor_and_type_from_gguf(gguf_weights, "output.weight")
lm_heads, lm_heads_type = get_torch_tensor_and_type_from_gguf(gguf_weights, "output.weight")
general_config.lm_heads_ptr = lm_heads.data_ptr()
general_config.lm_heads_type = type_to_ggml_type(lm_heads_type)
@@ -325,7 +326,7 @@ def build_llm(json_config, gguf_weights):
general_config.norm_weights_ptr = output_norm.data_ptr()
general_config.norm_weights_type = type_to_ggml_type(output_norm_type)
token_embd,token_embd_type = get_torch_tensor_and_type_from_gguf(weights, "token_embd.weight")
token_embd, token_embd_type = get_torch_tensor_and_type_from_gguf(weights, "token_embd.weight")
general_config.token_embd_ptr = token_embd.data_ptr()
general_config.token_embd_type = type_to_ggml_type(token_embd_type)
@@ -335,13 +336,12 @@ def build_llm(json_config, gguf_weights):
model = kt_kernel_ext.DeepseekV3Model(general_config)
llm.model = model
decoder_layers = []
real_load_layers = json_config["num_hidden_layers"] if load_layers is None else load_layers
for i in range(real_load_layers):
# for i in [2,3]:
layer = kt_kernel_ext.DeepseekV3DecoderLayer(general_config,i)
layer = kt_kernel_ext.DeepseekV3DecoderLayer(general_config, i)
attn_norm, attn_norm_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{i}.attn_norm.weight")
ffn_norm, ffn_norm_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{i}.ffn_norm.weight")
@@ -361,7 +361,7 @@ def build_llm(json_config, gguf_weights):
return llm
safetensor_path = '/home/bd/models/DeepSeek-R1'
safetensor_path = "/home/bd/models/DeepSeek-R1"
json_path = os.path.join(safetensor_path, "config.json")
json_config = json.load(open(json_path, "r"))
print(json_config)
@@ -372,11 +372,11 @@ weights = dict(sorted(weights.items()))
# for name, t in weights.items():
# if not name.startswith("blk"):
# if name.startswith("blk.10."):
# if "ffn_gate." in name:
# print(f"Found weight: {t.name}, Shape: {t.shape}, Type: {t.tensor_type.name}, Size: {t.n_elements}")
# print(f"Found weight: {t.name}, Shape: {t.shape}, Type: {t.tensor_type.name}, Size: {t.n_elements}")
# if not name.startswith("blk"):
# if name.startswith("blk.10."):
# if "ffn_gate." in name:
# print(f"Found weight: {t.name}, Shape: {t.shape}, Type: {t.tensor_type.name}, Size: {t.n_elements}")
# print(f"Found weight: {t.name}, Shape: {t.shape}, Type: {t.tensor_type.name}, Size: {t.n_elements}")
print("Building LLM ...")
load_start_time = time.perf_counter()
@@ -395,7 +395,7 @@ config = AutoConfig.from_pretrained(safetensor_path, trust_remote_code=True)
force_think = False
output_logits = torch.zeros((max_qlen, json_config['vocab_size']), dtype=torch.float32)
output_logits = torch.zeros((max_qlen, json_config["vocab_size"]), dtype=torch.float32)
def start_chat(content=None):
@@ -403,14 +403,12 @@ def start_chat(content=None):
content = input("Chat: ")
messages = [{"role": "user", "content": content}]
input_tensor = tokenizer.apply_chat_template(
messages, add_generation_prompt=True, return_tensors="pt"
)
input_tensor = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt")
if force_think:
token_thinks = torch.tensor([tokenizer.encode("<think>\\n",add_special_tokens=False)],device=input_tensor.device)
input_tensor = torch.cat(
[input_tensor, token_thinks], dim=1
token_thinks = torch.tensor(
[tokenizer.encode("<think>\\n", add_special_tokens=False)], device=input_tensor.device
)
input_tensor = torch.cat([input_tensor, token_thinks], dim=1)
input_tensor = input_tensor.squeeze(0) # Remove the batch dimension
print(f"Input tensor: {input_tensor}, type {input_tensor.dtype}, shape {input_tensor.shape}")
@@ -425,9 +423,11 @@ def start_chat(content=None):
kvlens = [0]
page_tables = [list(range(pages_count))]
start_time = time.perf_counter()
llm.forward(qlens,page_tables, kvlens, input_tensor.data_ptr(), output_logits.data_ptr())
llm.forward(qlens, page_tables, kvlens, input_tensor.data_ptr(), output_logits.data_ptr())
end_time = time.perf_counter()
print(f"Forward time: {end_time - start_time:.4f} seconds, tps: {qlens[0] / (end_time - start_time)} tokens/sec")
print(
f"Forward time: {end_time - start_time:.4f} seconds, tps: {qlens[0] / (end_time - start_time)} tokens/sec"
)
logits = output_logits[0]
# print(logits)
@@ -437,18 +437,18 @@ def start_chat(content=None):
# kvlen = input_tensor.shape[0]
input_tensor = torch.cat((input_tensor, torch.tensor([next_token])), dim=-1)
if next_token == tokenizer.eos_token_id or tokenizer.decode(next_token) == '<|im_end|>':
if next_token == tokenizer.eos_token_id or tokenizer.decode(next_token) == "<|im_end|>":
stream.end()
break
else:
stream.put(torch.tensor([next_token]))
job_id = 0
while True:
try:
# ---------- let the user decide whether to continue ----------
choice = input(
"\n[Enter] to start chatting | type 1 to read a file | type q/quit/exit to quit: "
).strip().lower()
choice = input("\n[Enter] to start chatting | type 1 to read a file | type q/quit/exit to quit: ").strip().lower()
if choice in {"q", "quit", "exit"}:
print("Exit command received, shutting down.")
break
@@ -473,14 +473,3 @@ while True:
logger.error(f"Error in job {job_id}: {e}", exc_info=True)
finally:
job_id += 1 # give the next job a fresh id, whether or not this one was interrupted
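Both chat scripts build page_tables as list(range(pages_count)); how pages_count is derived is outside the hunks shown here, so the sizing below is a minimal sketch under the assumption of fixed page_size-token KV pages. The helper name and the 256 default are illustrative only (256 matches the page_size used by the MLA tests later in this commit).

def make_page_table(prompt_len, max_new_tokens, page_size=256):
    # reserve enough KV pages for the prompt plus the tokens we expect to generate
    pages_count = (prompt_len + max_new_tokens + page_size - 1) // page_size
    return list(range(pages_count))

# page_tables = [make_page_table(input_tensor.shape[0], 1024)]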

View File

@@ -1,15 +1,17 @@
import math
import os,sys
import os, sys
import time
from typing import Optional
os.environ["BLAS_NUM_THREADS"] = "1"
sys.path.insert(0, os.path.dirname(__file__) + '/../build')
import kt_kernel_ext
sys.path.insert(0, os.path.dirname(__file__) + "/../build")
from kt_kernel import kt_kernel_ext
from kt_kernel_ext.kvcache import ggml_type
import torch
from torch import nn
import torch.nn.functional as F
# from modeling_deepseek_v3 import MoEGate
from configuration_deepseek_v3 import DeepseekV3Config
@@ -28,17 +30,20 @@ n_group = config.n_group
topk_group = config.topk_group
routed_scaling_factor = config.routed_scaling_factor
weights = torch.randn((n_routed_experts, hidden_size), dtype=torch.float32).to('cpu').contiguous()
bias = torch.randn((n_routed_experts,), dtype=torch.float32).to('cpu').contiguous()
weights = torch.randn((n_routed_experts, hidden_size), dtype=torch.float32).to("cpu").contiguous()
bias = torch.randn((n_routed_experts,), dtype=torch.float32).to("cpu").contiguous()
# weights = torch.randn((n_routed_experts, hidden_size), dtype=torch.float16).to('cpu').contiguous ()
def load_fp32_tensor(file_path, shape):
return torch.zeros(shape, dtype=torch.float32).to('cpu').contiguous()
with open(file_path, 'rb') as f:
return torch.zeros(shape, dtype=torch.float32).to("cpu").contiguous()
with open(file_path, "rb") as f:
raw_data = f.read()
tensor = torch.frombuffer(raw_data, dtype=torch.float32)
tensor = tensor.view(shape) # reshape according to your target shape
return tensor
class MoEGate(nn.Module):
def __init__(self, config):
super().__init__()
@@ -54,13 +59,9 @@ class MoEGate(nn.Module):
# topk selection algorithm
self.norm_topk_prob = config.norm_topk_prob
self.gating_dim = config.hidden_size
self.weight = nn.Parameter(
torch.empty((self.n_routed_experts, self.gating_dim))
)
self.weight = nn.Parameter(torch.empty((self.n_routed_experts, self.gating_dim)))
if self.topk_method == "noaux_tc":
self.e_score_correction_bias = nn.Parameter(
torch.empty((self.n_routed_experts))
)
self.e_score_correction_bias = nn.Parameter(torch.empty((self.n_routed_experts)))
self.reset_parameters()
def reset_parameters(self) -> None:
@@ -73,82 +74,78 @@ class MoEGate(nn.Module):
### compute gating score
hidden_states = hidden_states.view(-1, h)
h_to_check = load_fp32_tensor('/home/yzw/xwy/Projects/ktransformers-dev/csrc/ktransformers_ext/examples/debug/gate_input',(seq_len,h))
h_to_check = load_fp32_tensor(
"/home/yzw/xwy/Projects/ktransformers-dev/csrc/ktransformers_ext/examples/debug/gate_input", (seq_len, h)
)
diff = (h_to_check - hidden_states).abs().max()
# print("hidden_states diff:", diff)
# assert diff<0.02
bias_to_check = load_fp32_tensor('/home/yzw/xwy/Projects/ktransformers-dev/csrc/ktransformers_ext/examples/debug/bias',(n_routed_experts))
bias_to_check = load_fp32_tensor(
"/home/yzw/xwy/Projects/ktransformers-dev/csrc/ktransformers_ext/examples/debug/bias", (n_routed_experts)
)
diff = (bias - bias_to_check).abs().max()
# print('bias diff:',diff)
# assert diff < 0.02
logits = F.linear(hidden_states.type(torch.float32), self.weight.type(torch.float32), None)
logits = F.linear(
hidden_states.type(torch.float32), self.weight.type(torch.float32), None
logits_to_check = load_fp32_tensor(
"/home/yzw/xwy/Projects/ktransformers-dev/csrc/ktransformers_ext/examples/debug/gate_logits",
(seq_len, n_routed_experts),
)
logits_to_check = load_fp32_tensor('/home/yzw/xwy/Projects/ktransformers-dev/csrc/ktransformers_ext/examples/debug/gate_logits',(seq_len,n_routed_experts))
diff = (logits_to_check - logits).abs().max()
# print("logits diff:", diff)
# assert diff < 0.02
if self.scoring_func == "sigmoid":
scores = logits.sigmoid()
else:
raise NotImplementedError(
f"insupportable scoring function for MoE gating: {self.scoring_func}"
)
raise NotImplementedError(f"insupportable scoring function for MoE gating: {self.scoring_func}")
### select top-k experts
if self.topk_method == "noaux_tc":
# assert not self.training
scores_for_choice = scores.view(bsz * seq_len, -1) + self.e_score_correction_bias.unsqueeze(0)
scores_to_check = load_fp32_tensor('/home/yzw/xwy/Projects/ktransformers-dev/csrc/ktransformers_ext/examples/debug/scores_to_choice',(seq_len,n_routed_experts))
scores_to_check = load_fp32_tensor(
"/home/yzw/xwy/Projects/ktransformers-dev/csrc/ktransformers_ext/examples/debug/scores_to_choice",
(seq_len, n_routed_experts),
)
diff = (scores_for_choice - scores_to_check).abs().max()
print(f'score for choice diff = {diff}')
print(f"score for choice diff = {diff}")
group_scores = (
scores_for_choice.view(bsz * seq_len, self.n_group, -1).topk(2, dim=-1)[0].sum(dim = -1)
scores_for_choice.view(bsz * seq_len, self.n_group, -1).topk(2, dim=-1)[0].sum(dim=-1)
) # [n, n_group]
group_scores_to_check = load_fp32_tensor('/home/yzw/xwy/Projects/ktransformers-dev/csrc/ktransformers_ext/examples/debug/group_scores',(seq_len,n_group))
group_scores_to_check = load_fp32_tensor(
"/home/yzw/xwy/Projects/ktransformers-dev/csrc/ktransformers_ext/examples/debug/group_scores",
(seq_len, n_group),
)
diff = (group_scores - group_scores_to_check).abs().max()
print(f'group scores diff = {diff}')
print(f"group scores diff = {diff}")
group_idx = torch.topk(
group_scores, k=self.topk_group, dim=-1, sorted=False
)[
1
] # [n, top_k_group]
group_idx = torch.topk(group_scores, k=self.topk_group, dim=-1, sorted=False)[1] # [n, top_k_group]
group_mask = torch.zeros_like(group_scores) # [n, n_group]
group_mask.scatter_(1, group_idx, 1) # [n, n_group]
score_mask = (
group_mask.unsqueeze(-1)
.expand(
bsz * seq_len, self.n_group, self.n_routed_experts // self.n_group
)
.expand(bsz * seq_len, self.n_group, self.n_routed_experts // self.n_group)
.reshape(bsz * seq_len, -1)
) # [n, e]
tmp_scores = scores_for_choice.masked_fill(~score_mask.bool(), float("-inf")) # [n, e]
tmp_scores_to_check = load_fp32_tensor('/home/yzw/xwy/Projects/ktransformers-dev/csrc/ktransformers_ext/examples/debug/gate_logits_toped',(seq_len,n_routed_experts))
is_close = torch.isclose(tmp_scores, tmp_scores_to_check, rtol=1e-2, atol=1e-2, equal_nan=True)
print(f'tmp_score ok {is_close.all()}')
_, topk_idx = torch.topk(
tmp_scores, k=self.top_k, dim=-1, sorted=False
tmp_scores_to_check = load_fp32_tensor(
"/home/yzw/xwy/Projects/ktransformers-dev/csrc/ktransformers_ext/examples/debug/gate_logits_toped",
(seq_len, n_routed_experts),
)
is_close = torch.isclose(tmp_scores, tmp_scores_to_check, rtol=1e-2, atol=1e-2, equal_nan=True)
print(f"tmp_score ok {is_close.all()}")
_, topk_idx = torch.topk(tmp_scores, k=self.top_k, dim=-1, sorted=False)
topk_weight = scores.gather(1, topk_idx)
else:
raise NotImplementedError(
f"insupportable TopK function for MoE gating: {self.topk_method}"
)
raise NotImplementedError(f"insupportable TopK function for MoE gating: {self.topk_method}")
### norm gate to sum 1
if self.top_k > 1 and self.norm_topk_prob:
@@ -159,7 +156,6 @@ class MoEGate(nn.Module):
return topk_idx, topk_weight
def torch_gate(hidden_states):
hidden_states.unsqueeze_(0)
gate = MoEGate(config)
@@ -190,30 +186,27 @@ def cpuinfer_gate(hidden_states):
gate = kt_kernel_ext.gate.MoEGate(config)
expert_ids = torch.zeros((seqlen, num_experts_per_token), dtype=torch.int64).to("cpu").contiguous()
expert_weights = torch.zeros((seqlen, num_experts_per_token), dtype=torch.float32).to("cpu").contiguous()
expert_ids = torch.zeros((seqlen, num_experts_per_token), dtype=torch.int64).to('cpu').contiguous()
expert_weights = torch.zeros((seqlen, num_experts_per_token), dtype=torch.float32).to('cpu').contiguous()
gate.forward(seqlen,hidden_states.data_ptr(),expert_ids.data_ptr(), expert_weights.data_ptr())
gate.forward(seqlen, hidden_states.data_ptr(), expert_ids.data_ptr(), expert_weights.data_ptr())
# print(expert_ids,expert_weights)
return expert_ids, expert_weights
input = torch.randn(seqlen, hidden_size, dtype=torch.float32).to('cpu').contiguous()
input = torch.randn(seqlen, hidden_size, dtype=torch.float32).to("cpu").contiguous()
# print(input)
ids,we = cpuinfer_gate(input)
ids, we = cpuinfer_gate(input)
idx = torch.argsort(ids, dim=-1, descending=True)
ids = torch.gather(ids,dim=-1,index=idx)
we = torch.gather(we,dim=-1,index=idx)
ids = torch.gather(ids, dim=-1, index=idx)
we = torch.gather(we, dim=-1, index=idx)
std_ids,std_we= torch_gate(input)
std_ids, std_we = torch_gate(input)
idx = torch.argsort(std_ids, dim=-1, descending=True)
std_we = torch.gather(std_we,dim=-1,index=idx)
std_ids = torch.gather(std_ids,dim=-1,index=idx)
std_we = torch.gather(std_we, dim=-1, index=idx)
std_ids = torch.gather(std_ids, dim=-1, index=idx)
# print("ids diff:", torch.abs(std_ids - ids).max())
@@ -221,28 +214,3 @@ std_ids = torch.gather(std_ids,dim=-1,index=idx)
assert torch.abs(std_ids - ids).max() == 0, "Expert IDs do not match!"
assert torch.abs(std_we - we).max() < 1e-2, "Expert Weights do not match!"
print("Expert IDs and Weights match successfully!")
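Condensed, the noaux_tc routing that both the torch reference and kt_kernel_ext.gate.MoEGate are checked against is the pipeline below. This is only a compact restatement of the reference code above; routed_scaling_factor and the optional top-k renormalization, which the full module applies afterwards, are omitted.

def noaux_tc_route(logits, bias, n_group, topk_group, top_k):
    scores = logits.sigmoid()                                    # [n, e]
    scores_for_choice = scores + bias.unsqueeze(0)               # bias only steers the selection
    per_group = scores_for_choice.view(-1, n_group, scores.shape[-1] // n_group)
    group_scores = per_group.topk(2, dim=-1)[0].sum(dim=-1)      # [n, n_group], top-2 sum per group
    group_idx = group_scores.topk(topk_group, dim=-1, sorted=False)[1]
    group_mask = torch.zeros_like(group_scores).scatter_(1, group_idx, 1)
    score_mask = group_mask.repeat_interleave(scores.shape[-1] // n_group, dim=-1).bool()
    masked = scores_for_choice.masked_fill(~score_mask, float("-inf"))
    topk_idx = masked.topk(top_k, dim=-1, sorted=False)[1]
    topk_weight = scores.gather(1, topk_idx)                     # weights come from the unbiased scores
    return topk_idx, topk_weight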

View File

@@ -6,7 +6,7 @@ from typing import Dict, Literal
sys.path.insert(0, os.path.dirname(__file__) + "/../build")
import torch
import kt_kernel_ext
from kt_kernel import kt_kernel_ext
torch.manual_seed(42)
@@ -132,6 +132,7 @@ def pack_to_int32(value: torch.Tensor, num_bits: int, packed_dim: Literal[0, 1]
return packed
def pack_tensor_per_row(q: torch.Tensor, num_bits: int) -> torch.Tensor:
e, rows, cols = q.shape
flat = q.view(e * rows, cols)
@@ -283,9 +284,9 @@ def run_case(pattern: str) -> Dict[str, float]:
CPUInfer.sync()
input_tensor_fp16 = input_tensor.to(torch.float16)
t_output = moe_torch(
input_tensor_fp16, expert_ids, weights, gate_fp16, up_fp16, down_fp16
).to(torch.bfloat16)
t_output = moe_torch(input_tensor_fp16, expert_ids, weights, gate_fp16, up_fp16, down_fp16).to(
torch.bfloat16
)
t_output = t_output.flatten()
output = output.flatten()

View File

@@ -11,7 +11,7 @@ import numpy as np
# if REPO_ROOT not in sys.path:
# sys.path.insert(0, REPO_ROOT)
import kt_kernel_ext
from kt_kernel import kt_kernel_ext
from kt_kernel_ext import CPUInfer
@@ -89,9 +89,7 @@ def main():
moe = kt_kernel_ext.moe.AMXInt4_KGroup_MOE(cfg)
physical_to_logical_map = (
torch.arange(expert_num, dtype=torch.int64, device="cpu").contiguous()
)
physical_to_logical_map = torch.arange(expert_num, dtype=torch.int64, device="cpu").contiguous()
cpuinfer.submit(moe.load_weights_task(physical_to_logical_map.data_ptr()))
cpuinfer.sync()
@@ -169,6 +167,7 @@ def main():
total_bytes = total_weights // group_size + total_weights // 2
print(f"write_weight_scale_to_buffer time: {elapsed_ms:.2f} ms")
print(f"Throughput: {total_bytes / (elapsed_ms * 1e6):.2f} GB/s")
def split_expert_tensor(tensor, chunk):
"""Split tensor by experts"""
return [tensor[i * chunk : (i + 1) * chunk] for i in range(expert_num)]
@@ -229,10 +228,10 @@ def main():
tp_scale_offset = col_scale_start + tp_idx * tp_slice_scale_size
down_weight_tp_parts.append(
down_q_experts[expert_idx][tp_weight_offset:tp_weight_offset + tp_slice_weight_size]
down_q_experts[expert_idx][tp_weight_offset : tp_weight_offset + tp_slice_weight_size]
)
down_scale_tp_parts.append(
down_scale_experts[expert_idx][tp_scale_offset:tp_scale_offset + tp_slice_scale_size]
down_scale_experts[expert_idx][tp_scale_offset : tp_scale_offset + tp_slice_scale_size]
)
# Concatenate all column slices for this TP
@@ -260,7 +259,9 @@ def main():
assert torch.equal(w2_weight_bufs[tp_idx], expected_w2_weight), f"w2 weight bytes mismatch for TP {tp_idx}"
assert torch.allclose(w2_scale_bufs[tp_idx], expected_w2_scale), f"w2 scale values mismatch for TP {tp_idx}"
print(f"\n✓ write_weight_scale_to_buffer passed: extracted {gpu_experts} GPU experts across {gpu_tp_count} TP parts from total {expert_num} experts")
print(
f"\n✓ write_weight_scale_to_buffer passed: extracted {gpu_experts} GPU experts across {gpu_tp_count} TP parts from total {expert_num} experts"
)
if __name__ == "__main__":

View File

@@ -1,6 +1,6 @@
#!/usr/bin/env python
# coding=utf-8
'''
"""
Description :
Author : chenht2022
Date : 2024-07-25 10:32:05
@@ -8,11 +8,12 @@ Version : 1.0.0
LastEditors : chenht2022
LastEditTime : 2024-08-06 10:36:59
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
'''
"""
import os, sys
import time
sys.path.append(os.path.dirname(__file__) + '/../build')
import kt_kernel_ext
sys.path.append(os.path.dirname(__file__) + "/../build")
from kt_kernel import kt_kernel_ext
import torch
input_size = 16384
@@ -30,8 +31,10 @@ with torch.inference_mode(mode=True):
linears = []
projs = []
for _ in range(layer_num):
proj = torch.randn((output_size, input_size), dtype=torch.float16, device = "cuda").to("cpu").contiguous()
config = kt_kernel_ext.linear.LinearConfig(input_size, output_size, stride, group_max_len, proj.data_ptr(), proj_type, hidden_type)
proj = torch.randn((output_size, input_size), dtype=torch.float16, device="cuda").to("cpu").contiguous()
config = kt_kernel_ext.linear.LinearConfig(
input_size, output_size, stride, group_max_len, proj.data_ptr(), proj_type, hidden_type
)
linear = kt_kernel_ext.linear.Linear(config)
projs.append(proj)
linears.append(linear)
@@ -43,20 +46,14 @@ with torch.inference_mode(mode=True):
output = torch.empty((qlen, output_size), dtype=torch.float16).contiguous()
input = input / 100
CPUInfer.submit(
linear.forward(
qlen,
input.data_ptr(),
output.data_ptr()
)
)
CPUInfer.submit(linear.forward(qlen, input.data_ptr(), output.data_ptr()))
CPUInfer.sync()
# print('cpuinfer output', output)
proj = projs[i%layer_num]
proj = projs[i % layer_num]
t_output = torch.mm(input, proj.t())
# print('torch output', t_output)
diff = torch.mean(torch.abs(output - t_output)) / torch.mean(torch.abs(t_output))
print('diff = ', diff)
assert(diff < 0.001)
print("diff = ", diff)
assert diff < 0.001
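The accuracy check here, and several of the tests below, use the same mean-relative-error metric; written once as a helper (hypothetical name, torch assumed in scope) it is simply:

def mean_relative_error(output, reference):
    # mean(|out - ref|) / mean(|ref|), the quantity asserted against 0.001 above
    return (torch.mean(torch.abs(output - reference)) / torch.mean(torch.abs(reference))).item()

# assert mean_relative_error(output, t_output) < 0.001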

View File

@@ -1,19 +1,22 @@
import logging
import os,sys
import os, sys
import time
from typing import Optional
os.environ["BLAS_NUM_THREADS"] = "1"
sys.path.insert(0, os.path.dirname(__file__) + '/../build')
import kt_kernel_ext
sys.path.insert(0, os.path.dirname(__file__) + "/../build")
from kt_kernel import kt_kernel_ext
from kt_kernel_ext.kvcache import ggml_type
import torch
from torch import inf, nn
from torch.nn import init
from torch_attention import apply_rotary_pos_emb,DeepseekV2RMSNorm,KDeepSeekV3Cache,DeepseekV3YarnRotaryEmbedding
from torch_attention import apply_rotary_pos_emb, DeepseekV2RMSNorm, KDeepSeekV3Cache, DeepseekV3YarnRotaryEmbedding
logger = logging.getLogger("reader")
from gguf.gguf_reader import GGUFReader
def read_gguf_file(gguf_file_path):
"""
Reads and prints key-value pairs and tensor information from a GGUF file in an improved format.
@@ -46,12 +49,15 @@ def read_gguf_file(gguf_file_path):
re.append(tensor)
return re
def get_torch_tensor_from_gguf(gguf_weights, name):
return torch.from_numpy(gguf_weights[name].data).contiguous()
def get_torch_tensor_and_type_from_gguf(gguf_weights, name):
return torch.from_numpy(gguf_weights[name].data).contiguous(), gguf_weights[name].tensor_type.name
def type_to_ggml_type(type):
if type == "F32":
return ggml_type.FP32
@@ -75,7 +81,7 @@ kvlen = 0
page_table = range(20)
bsz_tensors=torch.tensor([1])
bsz_tensors = torch.tensor([1])
page_size = 256
@@ -97,7 +103,6 @@ max_kvlen = 4096
max_position_embeddings = 163840
rope_scaling = {
"beta_fast": 32,
"beta_slow": 1,
@@ -105,11 +110,10 @@ rope_scaling = {
"mscale": 1.0,
"mscale_all_dim": 1.0,
"original_max_position_embeddings": 4096,
"type": "yarn"
"type": "yarn",
}
CPUInfer = kt_kernel_ext.CPUInfer(30)
validation_iter = 100
@@ -119,15 +123,16 @@ weight_type = torch.bfloat16
# weight_type = torch.float16
input_type = {torch.float32:torch.float32,
torch.float16:torch.float16,
torch.bfloat16:torch.float32,
}[weight_type]
input_type = {
torch.float32: torch.float32,
torch.float16: torch.float16,
torch.bfloat16: torch.float32,
}[weight_type]
q_a_proj = nn.Linear(hidden_size, q_lora_rank, bias=False, dtype=weight_type)
q_b_proj = nn.Linear(q_lora_rank, num_heads * (nope_size+rope_size) , bias=False, dtype=weight_type)
q_b_proj = nn.Linear(q_lora_rank, num_heads * (nope_size + rope_size), bias=False, dtype=weight_type)
kv_a_proj_with_mqa = nn.Linear(hidden_size, kv_lora_rank + rope_size, bias=False, dtype=weight_type)
kv_b_proj = nn.Linear( num_heads * (nope_size + nope_size),kv_lora_rank, bias=False, dtype=weight_type)
kv_b_proj = nn.Linear(num_heads * (nope_size + nope_size), kv_lora_rank, bias=False, dtype=weight_type)
o_proj = nn.Linear(num_heads * nope_size, hidden_size, bias=False, dtype=weight_type)
q_a_norm = torch.ones(hidden_size, dtype=torch.float32)
kv_a_norm = torch.ones(hidden_size, dtype=torch.float32)
@@ -203,16 +208,16 @@ q_absorb = x_reshaped[:, 0]
out_absorb = x_reshaped[:, 1]
hidden_states = torch.randn((qlen, hidden_size), dtype=input_type).to('cpu').contiguous()
hidden_states = torch.randn((qlen, hidden_size), dtype=input_type).to("cpu").contiguous()
def test_cpu_mla():
os.environ["BLAS_NUM_THREADS"] = "1"
q_a_proj_weight = q_a_proj.weight.to(weight_type).to('cpu').contiguous()
q_b_proj_weight = q_b_proj.weight.to(weight_type).to('cpu').contiguous()
kv_a_proj_with_mqa_weight = kv_a_proj_with_mqa.weight.to('cpu').to(weight_type).contiguous()
kv_b_proj_weight = kv_b_proj.weight.to(weight_type).to('cpu').contiguous()
o_proj_weight = o_proj.weight.to(weight_type).to('cpu').contiguous()
q_a_proj_weight = q_a_proj.weight.to(weight_type).to("cpu").contiguous()
q_b_proj_weight = q_b_proj.weight.to(weight_type).to("cpu").contiguous()
kv_a_proj_with_mqa_weight = kv_a_proj_with_mqa.weight.to("cpu").to(weight_type).contiguous()
kv_b_proj_weight = kv_b_proj.weight.to(weight_type).to("cpu").contiguous()
o_proj_weight = o_proj.weight.to(weight_type).to("cpu").contiguous()
config = kt_kernel_ext.mla.MLAConfig(
hidden_size,
@@ -245,7 +250,6 @@ def test_cpu_mla():
config.kv_a_norm_type = ggml_type.FP32
config.page_count = pages_count
if weight_type == torch.float32:
config.q_a_proj_type = ggml_type.FP32
config.q_b_proj_type = ggml_type.FP32
@@ -267,10 +271,8 @@ def test_cpu_mla():
else:
raise ValueError(f"Unsupported data type: {weight_type}")
config.pool = CPUInfer.backend_
if weight_type == torch.float32:
mla = kt_kernel_ext.mla.MLA_F32(config)
elif weight_type == torch.float16:
@@ -284,50 +286,49 @@ def test_cpu_mla():
mla.load_weights()
mla.set_local_pages(pages_count)
output = torch.zeros((qlen, hidden_size), dtype=input_type).to('cpu').contiguous()
mla.forward([qlen],[page_table],[kvlen],hidden_states.data_ptr(),output.data_ptr())
print("CPU MLA Output: ",output)
output = torch.zeros((qlen, hidden_size), dtype=input_type).to("cpu").contiguous()
mla.forward([qlen], [page_table], [kvlen], hidden_states.data_ptr(), output.data_ptr())
print("CPU MLA Output: ", output)
return output
def load_fp16_tensor(file_path, shape):
# return load_fp32_tensor(file_path, shape)
return torch.zeros(shape)
with open(file_path, 'rb') as f:
with open(file_path, "rb") as f:
raw_data = f.read()
tensor = torch.frombuffer(raw_data, dtype=weight_type)
tensor = tensor.view(shape) # reshape according to your target shape
return tensor
def load_fp32_tensor(file_path, shape):
return torch.zeros(shape)
with open(file_path, 'rb') as f:
with open(file_path, "rb") as f:
raw_data = f.read()
tensor = torch.frombuffer(raw_data, dtype=torch.float32)
tensor = tensor.view(shape) # reshape according to your target shape
return tensor
def test_torch():
torch.set_grad_enabled(False)
softmax_scale = (nope_size + rope_size) ** -0.5
# the 1 is the number of compressed-kv heads
k_caches = torch.randn(1,pages_count, page_size, 1, kv_lora_rank + rope_size).to(weight_type)
k_caches = torch.randn(1, pages_count, page_size, 1, kv_lora_rank + rope_size).to(weight_type)
kv_cache = KDeepSeekV3Cache(page_size=page_size, kv_lora_rank=kv_lora_rank, k_caches=k_caches)
q_a_layernorm = DeepseekV2RMSNorm(q_lora_rank)
q_a_layernorm.weight = nn.Parameter( q_a_norm,requires_grad=False)
q_a_layernorm.weight = nn.Parameter(q_a_norm, requires_grad=False)
x = torch.randn(q_lora_rank, dtype=weight_type)*100
x = torch.randn(q_lora_rank, dtype=weight_type) * 100
print(x)
print(q_a_layernorm(x))
kv_a_layernorm = DeepseekV2RMSNorm(kv_lora_rank)
kv_a_layernorm.weight = nn.Parameter(kv_a_norm, requires_grad=False)
# step 3: split into two tensors
# q_absorb, out_absorb = x_permuted[:, 0], x_permuted[:, 1] # 都是 (num_heads, nope_size, kv_lora_rank
# q_absorb = kv_b_proj[:, ] # torch.randn(num_heads, nope_size, kv_lora_rank, dtype=data_type)
@@ -348,25 +349,24 @@ def test_torch():
# kv_indices is [0:bsz], page_idx=[0:bsz], page_offset=[kvlen:qlen+kvlen]
# last_page_len = [qlen+kvlen,...] layer_idx = 1
# position_ids = [kvlen:qlen+kvlen]
q_indptr = torch.tensor([0,qlen]).to(torch.int32)
q_indptr = torch.tensor([0, qlen]).to(torch.int32)
kv_indptr = torch.tensor([0,(qlen+kvlen+page_size-1)//page_size]).to(torch.int32)
kv_indptr = torch.tensor([0, (qlen + kvlen + page_size - 1) // page_size]).to(torch.int32)
kv_indices = torch.tensor(range(pages_count)).to(torch.int32)
page_idx = torch.tensor([i//page_size for i in range(kvlen,kvlen+qlen)] ).to(torch.int32)
page_offset = torch.tensor( [i%page_size for i in range(kvlen, kvlen + qlen)]).to(torch.int32)
page_idx = torch.tensor([i // page_size for i in range(kvlen, kvlen + qlen)]).to(torch.int32)
page_offset = torch.tensor([i % page_size for i in range(kvlen, kvlen + qlen)]).to(torch.int32)
last_page_len = torch.tensor([256], device=hidden_states.device)
position_ids = torch.tensor(range(kvlen, kvlen + qlen)).to(torch.int32)
# build the mask row by row [qlen, kvlen+qlen]
attention_masks = torch.zeros((max_qlen, max_kvlen), dtype=weight_type)
for i in range(max_qlen):
attention_masks[i, i + kvlen + 1:] = -inf
attention_masks[i, i + kvlen + 1 :] = -inf
def torch_attn(hidden_states_i: torch.Tensor,
def torch_attn(
hidden_states_i: torch.Tensor,
kv_cache: KDeepSeekV3Cache,
position_ids: torch.Tensor,
page_idx: torch.Tensor,
@@ -385,28 +385,28 @@ def test_torch():
# range bsz_tensors
final_attention_output = torch.tensor([], device=hidden_states.device)
for i in range(bsz_tensors[0]):
batch_num_tokens_tensors = q_indptr[i+1] - q_indptr[i]
batch_num_tokens_tensors = q_indptr[i + 1] - q_indptr[i]
batch_last_page_len = last_page_len[i]
# kv_total_len is kv_len, batch_compressed_kv is compressed_kv, batch_k_pe is k_pe
batch_page_idx = page_idx[q_indptr[i]:q_indptr[i+1]]
batch_page_offset = page_offset[q_indptr[i]:q_indptr[i+1]]
batch_page_idx = page_idx[q_indptr[i] : q_indptr[i + 1]]
batch_page_offset = page_offset[q_indptr[i] : q_indptr[i + 1]]
# kv_page_nums is the number of pages for the current batch
kv_page_nums = kv_indptr[i+1] - kv_indptr[i]
kv_page_nums = kv_indptr[i + 1] - kv_indptr[i]
# kv_total_len is the total length of the kv cache for the current batch (kv_len for algorithm)
kv_total_len = kv_page_nums * page_size
if batch_last_page_len is not None:
kv_total_len = kv_total_len - (page_size - batch_last_page_len)
# print(f"kv_total_len's shape {kv_total_len.shape}")
# kv_index is the index of the kv cache pages for the current batch
kv_index = kv_indices[kv_indptr[i]:kv_indptr[i+1]]
kv_index = kv_indices[kv_indptr[i] : kv_indptr[i + 1]]
# we can index [kv_index, page_offset_indices] to get the kv cache for the current batch
# from q_indptr[i] to q_indptr[i+1] is the range of the current batch
batch_hidden_states = hidden_states[q_indptr[i]:q_indptr[i+1]]
batch_position_ids = position_ids[q_indptr[i]:q_indptr[i+1]]
batch_hidden_states = hidden_states[q_indptr[i] : q_indptr[i + 1]]
batch_position_ids = position_ids[q_indptr[i] : q_indptr[i + 1]]
qlen, _ = batch_hidden_states.size()
# print("qlen -> ", qlen)
hidden_states_to_check = load_fp16_tensor('./debug/query_0_tp_0_input.bin',batch_hidden_states.shape)
hidden_states_to_check = load_fp16_tensor("./debug/query_0_tp_0_input.bin", batch_hidden_states.shape)
diff = torch.abs(batch_hidden_states - hidden_states_to_check).max()
print("hidden_states diff -> ", diff)
@@ -422,8 +422,6 @@ def test_torch():
# print("q_lora mae -> ", mae)
# print("q_lora mae test -> ", mae_test)
q_lora_norm = q_a_layernorm(q_lora)
# q_lora_norm_to_check = load_fp16_tensor('./debug/query_0_tp_0_qlora_norm.bin', q_lora_norm.shape)
# q_lora_norm_to_check_test = load_fp16_tensor('./debug/query_0_tp_0_qlora_norm_test.bin', q_lora_norm.shape)
@@ -438,22 +436,17 @@ def test_torch():
q = q_b_proj(q_lora_norm)
# for v3, bsz, qlen, num_heads(128), qk_head_dim(192=128(nope)+64(rope))
q = q.view(qlen, num_heads, nope_size+rope_size)
q = q.view(qlen, num_heads, nope_size + rope_size)
# q_nope is [qlen, num_heads(128), qk_nope_head_dim(128)]
# q_pe is [qlen, num_heads(128), qk_rope_head_dim(64)]
q_nope, q_pe = torch.split(
q, [nope_size, rope_size], dim=-1
)
q_nope, q_pe = torch.split(q, [nope_size, rope_size], dim=-1)
# compressed_kv is [qlen, kv_lora_rank(512) + rope(64)]
compressed_kv = kv_a_proj_with_mqa(batch_hidden_states)
# compressed_kv is [qlen, kv_lora_rank(512)], k_pe is [qlen, rope(64)]
compressed_kv, k_pe = torch.split(
compressed_kv, [kv_lora_rank, rope_size], dim=-1
)
compressed_kv, k_pe = torch.split(compressed_kv, [kv_lora_rank, rope_size], dim=-1)
compressed_kv = compressed_kv.contiguous()
# compressed_kv_page_0 = compressed_kv[0:page_size, :]
# compressed_kv_to_check = load_fp16_tensor('./debug/query_0_tp_0_page_0_kv_lora_rank',
# compressed_kv_page_0.shape)
@@ -473,9 +466,6 @@ def test_torch():
# print("compressed_kv diff norm -> ", diff)
# print("compressed_kv mae norm -> ", mae)
k_pe = k_pe.view(qlen, 1, rope_size)
# compressed_kv is [qlen, 1, kv_lora_rank(512)]
compressed_kv = compressed_kv.view(qlen, 1, kv_lora_rank)
@@ -495,7 +485,7 @@ def test_torch():
# print("q_nope[0] diff test -> ", diff_test)
# print("q_nope[0] mae test -> ", mae_test)
q_pe_nope = q_pe.transpose(0,1)
q_pe_nope = q_pe.transpose(0, 1)
# q_pe_0_to_check = load_fp16_tensor('./debug/query_0_tp_0_q_rope', q_pe_nope[0].shape)
# q_pe_0_to_check = load_fp16_tensor('./debug/query_0_tp_0_q_rope_no_rope', q_pe_nope[0].shape)
# q_pe_0_to_check_test = load_fp16_tensor('./debug/query_0_tp_0_q_rope_no_rope_test', q_pe_nope[0].shape)
@@ -539,7 +529,6 @@ def test_torch():
# diff = torch.abs(q_pe - q_new).max()
# print("q_pe diff -> ", diff)
# q_pe_0_to_check = load_fp16_tensor('./debug/query_0_tp_0_q_rope', q_pe[0].shape)
# diff = torch.abs(q_pe[0] - q_pe_0_to_check).max()
# mae = torch.mean(torch.abs(q_pe[0] - q_pe_0_to_check))
@@ -552,10 +541,17 @@ def test_torch():
# print("q_pe[0] 2 mae -> ", mae)
if kv_cache is not None:
cache_kwargs = {"sin": sin, "cos": cos, "page_idx": batch_page_idx, "page_offset": batch_page_offset} # Specific to RoPE models
compressed_kv_with_k_pe = kv_cache.update(compressed_kv.unsqueeze(0), k_pe, layer_idx, batch_page_idx, batch_page_offset, cache_kwargs)
compressed_kv = compressed_kv_with_k_pe [:, :, :, :kv_lora_rank].view(-1, page_size, kv_lora_rank)
k_pe = compressed_kv_with_k_pe [:, :, :, kv_lora_rank:].view(-1, page_size, rope_size)
cache_kwargs = {
"sin": sin,
"cos": cos,
"page_idx": batch_page_idx,
"page_offset": batch_page_offset,
} # Specific to RoPE models
compressed_kv_with_k_pe = kv_cache.update(
compressed_kv.unsqueeze(0), k_pe, layer_idx, batch_page_idx, batch_page_offset, cache_kwargs
)
compressed_kv = compressed_kv_with_k_pe[:, :, :, :kv_lora_rank].view(-1, page_size, kv_lora_rank)
k_pe = compressed_kv_with_k_pe[:, :, :, kv_lora_rank:].view(-1, page_size, rope_size)
# q_absorb is [num_heads(128), qk_nope_head_dim(128), kv_lora_rank(512)]
# out_absorb is [num_heads(128), kv_lora_rank(512), v_head_dim(128)] v_head_dim is also the nope dim
# q_absorb, out_absorb = get_absorbed()
@@ -611,21 +607,20 @@ def test_torch():
# batch_compressed_kv is [kv_total_len(k_len), kv_lora_rank(512)]
# batch_k_pe is [kv_total_len(k_len), qk_rope_head_dim(64)]
# k_pe_to_check = load_fp16_tensor('./debug/query_0_tp_0_page_0_k_rope', (256,64))
# diff = torch.abs(batch_k_pe[:256] - k_pe_to_check).max()
# mae = torch.mean(torch.abs(batch_k_pe[:256] - k_pe_to_check))
# print("k_pe diff -> ", diff)
# print("k_pe mae -> ", mae)
pe_weights = torch.matmul(q_pe,batch_k_pe.mT)
pe_weights = torch.matmul(q_pe, batch_k_pe.mT)
kv_total_len = kv_page_nums * page_size
# pe_weights_0 = load_fp16_tensor('./debug/query_0_tp_0_pe_attention_weights', (1024,4096))
# pe_weights_0 = pe_weights_0[0:qlen, 0:kv_total_len]
# diff = torch.abs(pe_weights[0] - pe_weights_0).max()
# print("pe_weights[0] diff -> ", diff)
attention_weights = (pe_weights + torch.matmul(q_nope, batch_compressed_kv.mT))
attention_weights = pe_weights + torch.matmul(q_nope, batch_compressed_kv.mT)
# raw_weights = load_fp16_tensor('./debug/query_0_tp_0_raw_attention_weights', (1024, 4096))
# raw_weights = raw_weights[0:qlen, 0:kv_total_len]
@@ -641,18 +636,18 @@ def test_torch():
print(attention_weights.shape)
print(attention_masks.shape)
attention_weights = (attention_weights + attention_masks[ :attention_weights.shape[1],:attention_weights.shape[2]])
attention_weights = (
attention_weights + attention_masks[: attention_weights.shape[1], : attention_weights.shape[2]]
)
# attention_weights shape is [num_heads(128), qlen, k_len]
attention_weights = nn.functional.softmax(attention_weights,dim=-1,dtype=weight_type).to(q_pe.dtype)
attention_weights = nn.functional.softmax(attention_weights, dim=-1, dtype=weight_type).to(q_pe.dtype)
# attention_weights_0 = load_fp16_tensor('./debug/query_0_tp_0_attention_weights', (1024, 4096))
# attention_weights_0 = attention_weights_0[0:qlen, 0:kv_total_len]
# diff = torch.abs(attention_weights[0] - attention_weights_0).max()
# print("attention_weights[0] diff -> ", diff)
attn_output = torch.matmul(attention_weights, batch_compressed_kv) # [num_heads(128),qlen, lora_rank(512)]
# out_absorb shape is [num_heads(128), kv_lora_rank(512), v_head_dim(128)]
@@ -671,8 +666,8 @@ def test_torch():
attn_output = attn_output.transpose(0, 1) # [qlen, num_heads(128), v_head_dim(128)]
attn_output = attn_output.reshape(qlen, num_heads * nope_size)
w_o = o_proj.weight.view([hidden_size,num_heads * nope_size])
output = torch.matmul(attn_output,w_o.transpose(0,1))
w_o = o_proj.weight.view([hidden_size, num_heads * nope_size])
output = torch.matmul(attn_output, w_o.transpose(0, 1))
output = output.view(qlen, hidden_size)
# output_0_check = load_fp16_tensor('./debug/query_0_tp_0_qlen_output', (qlen, hidden_size))
@@ -685,18 +680,14 @@ def test_torch():
# diff = torch.abs(h1_output - output_0_check).max()
# print("h1_output diff -> ", diff)
# output_check = load_fp16_tensor('./debug/output.bin', output.shape)
# diff = torch.abs(output - output_check).max()
# mae = torch.mean(torch.abs(output - output_check))
# print("output diff -> ", diff)
final_attention_output = torch.cat((final_attention_output, output), dim=0)
return final_attention_output
torch_output = torch_attn(
hidden_states,
kv_cache,
@@ -709,11 +700,12 @@ def test_torch():
kv_indptr=kv_indptr,
bsz_tensors=bsz_tensors,
last_page_len=last_page_len,
layer_idx=0
layer_idx=0,
)
print("Torch Output: ",torch_output)
print("Torch Output: ", torch_output)
return torch_output
torch.set_printoptions(sci_mode=False, precision=5)
output_cpu = test_cpu_mla()
output_torch = test_torch()
@@ -724,11 +716,9 @@ diff = (output_cpu - output_torch).abs()
diff_relative = diff / (output_cpu.abs())
# replace NaN entries in diff_relative with 0
diff_relative = torch.where(torch.isnan(diff_relative), torch.zeros_like(diff_relative), diff_relative)
diff_relative_mean = torch.mean(torch.abs(output_cpu-output_torch)) / torch.mean(torch.abs(output_torch))
diff_relative_mean = torch.mean(torch.abs(output_cpu - output_torch)) / torch.mean(torch.abs(output_torch))
print(f'Diff: ave:{diff.mean()}, max:{diff.max()}, min:{diff.min()}, relative_mean:{diff_relative_mean}, relative_max:{diff_relative.max()}, relative_min:{diff_relative.min()}')
print(
f"Diff: ave:{diff.mean()}, max:{diff.max()}, min:{diff.min()}, relative_mean:{diff_relative_mean}, relative_max:{diff_relative.max()}, relative_min:{diff_relative.min()}"
)
assert diff_relative_mean < 2e-1, "CPU and Torch outputs are not close enough!"
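Stripped of the dump comparisons, the per-head math that torch_attn implements (and that the CPU MLA kernel is validated against) is the absorbed-MLA form sketched below. RoPE, paging and the causal mask are omitted; the q_absorb multiplication and the exact placement of softmax_scale are not visible in the hunks above, so both are assumptions here. Tensor names and shapes follow the comments in the file.

# q_nope: [heads, qlen, nope], q_pe: [heads, qlen, rope]
# ckv: [k_len, kv_lora_rank], k_pe: [k_len, rope]
# q_absorb: [heads, nope, kv_lora_rank], out_absorb: [heads, kv_lora_rank, nope]
q_lat = torch.matmul(q_nope, q_absorb)                 # queries projected into the latent KV space
scores = torch.matmul(q_pe, k_pe.mT) + torch.matmul(q_lat, ckv.mT)
scores = torch.softmax(scores * softmax_scale, dim=-1) # standard scaled dot-product form assumed
attn = torch.matmul(scores, ckv)                       # [heads, qlen, kv_lora_rank]
attn = torch.matmul(attn, out_absorb)                  # back to [heads, qlen, nope]
w_o = o_proj.weight.view(hidden_size, num_heads * nope_size)
out = torch.matmul(attn.transpose(0, 1).reshape(qlen, -1), w_o.T)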

View File

@@ -1,19 +1,22 @@
import logging
import os,sys
import os, sys
import time
from typing import Optional
os.environ["BLAS_NUM_THREADS"] = "1"
sys.path.insert(0, os.path.dirname(__file__) + '/../build')
import kt_kernel_ext
sys.path.insert(0, os.path.dirname(__file__) + "/../build")
from kt_kernel import kt_kernel_ext
from kt_kernel_ext.kvcache import ggml_type
import torch
from torch import inf, nn
from torch.nn import init
from torch_attention import apply_rotary_pos_emb,DeepseekV2RMSNorm,KDeepSeekV3Cache,DeepseekV3YarnRotaryEmbedding
from torch_attention import apply_rotary_pos_emb, DeepseekV2RMSNorm, KDeepSeekV3Cache, DeepseekV3YarnRotaryEmbedding
logger = logging.getLogger("reader")
from gguf.gguf_reader import GGUFReader
def read_gguf_file(gguf_file_path):
"""
Reads and prints key-value pairs and tensor information from a GGUF file in an improved format.
@@ -46,12 +49,15 @@ def read_gguf_file(gguf_file_path):
re.append(tensor)
return re
def get_torch_tensor_from_gguf(gguf_weights, name):
return torch.from_numpy(gguf_weights[name].data).contiguous()
def get_torch_tensor_and_type_from_gguf(gguf_weights, name):
return torch.from_numpy(gguf_weights[name].data).contiguous(), gguf_weights[name].tensor_type.name
def type_to_ggml_type(type):
if type == "F32":
return ggml_type.FP32
@@ -75,7 +81,7 @@ kvlen = 0
page_table = range(20)
bsz_tensors=torch.tensor([1])
bsz_tensors = torch.tensor([1])
page_size = 256
@@ -97,7 +103,6 @@ max_kvlen = 4096
max_position_embeddings = 163840
rope_scaling = {
"beta_fast": 32,
"beta_slow": 1,
@@ -105,11 +110,10 @@ rope_scaling = {
"mscale": 1.0,
"mscale_all_dim": 1.0,
"original_max_position_embeddings": 4096,
"type": "yarn"
"type": "yarn",
}
CPUInfer = kt_kernel_ext.CPUInfer(64)
validation_iter = 100
@@ -119,15 +123,16 @@ weight_type = torch.bfloat16
# weight_type = torch.float16
input_type = {torch.float32:torch.float32,
torch.float16:torch.float16,
torch.bfloat16:torch.float32,
}[weight_type]
input_type = {
torch.float32: torch.float32,
torch.float16: torch.float16,
torch.bfloat16: torch.float32,
}[weight_type]
q_a_proj = nn.Linear(hidden_size, q_lora_rank, bias=False, dtype=weight_type)
q_b_proj = nn.Linear(q_lora_rank, num_heads * (nope_size+rope_size) , bias=False, dtype=weight_type)
q_b_proj = nn.Linear(q_lora_rank, num_heads * (nope_size + rope_size), bias=False, dtype=weight_type)
kv_a_proj_with_mqa = nn.Linear(hidden_size, kv_lora_rank + rope_size, bias=False, dtype=weight_type)
kv_b_proj = nn.Linear( num_heads * (nope_size + nope_size),kv_lora_rank, bias=False, dtype=weight_type)
kv_b_proj = nn.Linear(num_heads * (nope_size + nope_size), kv_lora_rank, bias=False, dtype=weight_type)
o_proj = nn.Linear(num_heads * nope_size, hidden_size, bias=False, dtype=weight_type)
q_a_norm = torch.ones(hidden_size, dtype=torch.float32)
kv_a_norm = torch.ones(hidden_size, dtype=torch.float32)
@@ -203,16 +208,16 @@ q_absorb = x_reshaped[:, 0]
out_absorb = x_reshaped[:, 1]
hidden_states = torch.randn((qlen, hidden_size), dtype=input_type).to('cpu').contiguous()
hidden_states = torch.randn((qlen, hidden_size), dtype=input_type).to("cpu").contiguous()
def build_mla():
os.environ["BLAS_NUM_THREADS"] = "1"
q_a_proj_weight = q_a_proj.weight.to(weight_type).to('cpu').contiguous()
q_b_proj_weight = q_b_proj.weight.to(weight_type).to('cpu').contiguous()
kv_a_proj_with_mqa_weight = kv_a_proj_with_mqa.weight.to('cpu').to(weight_type).contiguous()
kv_b_proj_weight = kv_b_proj.weight.to(weight_type).to('cpu').contiguous()
o_proj_weight = o_proj.weight.to(weight_type).to('cpu').contiguous()
q_a_proj_weight = q_a_proj.weight.to(weight_type).to("cpu").contiguous()
q_b_proj_weight = q_b_proj.weight.to(weight_type).to("cpu").contiguous()
kv_a_proj_with_mqa_weight = kv_a_proj_with_mqa.weight.to("cpu").to(weight_type).contiguous()
kv_b_proj_weight = kv_b_proj.weight.to(weight_type).to("cpu").contiguous()
o_proj_weight = o_proj.weight.to(weight_type).to("cpu").contiguous()
config = kt_kernel_ext.mla.MLAConfig(
hidden_size,
@@ -244,7 +249,6 @@ def build_mla():
config.kv_a_norm = kv_a_norm.data_ptr()
config.kv_a_norm_type = ggml_type.FP32
if weight_type == torch.float32:
config.q_a_proj_type = ggml_type.FP32
config.q_b_proj_type = ggml_type.FP32
@@ -266,10 +270,8 @@ def build_mla():
else:
raise ValueError(f"Unsupported data type: {weight_type}")
config.pool = CPUInfer.backend_
if weight_type == torch.float32:
mla = kt_kernel_ext.mla.MLA_F32(config)
elif weight_type == torch.float16:
@@ -284,19 +286,14 @@ def build_mla():
return mla
def load_fp32_tensor(file_path, shape):
with open(file_path, 'rb') as f:
with open(file_path, "rb") as f:
raw_data = f.read()
tensor = torch.frombuffer(raw_data, dtype=torch.float32)
tensor = tensor.view(shape) # reshape according to your target shape
return tensor
# page3 = load_fp32_tensor('/home/yzw/xwy/Projects/ktransformers-dev/csrc/ktransformers_ext/examples/debug1/query_0_tp_0_page_3_kv_lora_rank_norm.f32',(page_size,kv_lora_rank))
# page3_2 = load_fp32_tensor('/home/yzw/xwy/Projects/ktransformers-dev/csrc/ktransformers_ext/examples/debug2/query_0_tp_0_page_3_kv_lora_rank_norm.f32',(page_size,kv_lora_rank))
@@ -320,7 +317,6 @@ def load_fp32_tensor(file_path, shape):
# print(f'PE Attention Weights Diff: ave:{diff.mean()}, max:{diff.max()}')
# raw_attn_w_1 = load_fp32_tensor('/home/yzw/xwy/Projects/ktransformers-dev/csrc/ktransformers_ext/examples/debug1/query_0_tp_0_raw_attention_weights.f32',(1,max_kvlen))
# raw_attn_w_2 = load_fp32_tensor('/home/yzw/xwy/Projects/ktransformers-dev/csrc/ktransformers_ext/examples/debug2/query_0_tp_0_raw_attention_weights.f32',(qlen,max_kvlen))
# diff = torch.abs(raw_attn_w_1 - raw_attn_w_2[-1])
@@ -334,22 +330,16 @@ def load_fp32_tensor(file_path, shape):
# print(f'Output Diff: ave:{diff.mean()}, max:{diff.max()}')
mla = build_mla()
output = torch.zeros((qlen, hidden_size), dtype=input_type).to('cpu').contiguous()
mla.forward([qlen],[page_table],[kvlen],hidden_states.data_ptr(),output.data_ptr())
print("CPU MLA Output: ",output[-1])
output = torch.zeros((qlen, hidden_size), dtype=input_type).to("cpu").contiguous()
mla.forward([qlen], [page_table], [kvlen], hidden_states.data_ptr(), output.data_ptr())
print("CPU MLA Output: ", output[-1])
output_2 = torch.zeros((1, hidden_size), dtype=input_type).to('cpu').contiguous()
mla.forward([1],[page_table],[qlen-1],hidden_states[-1].data_ptr(),output_2.data_ptr())
print("CPU MLA Output 2: ",output_2[-1])
output_2 = torch.zeros((1, hidden_size), dtype=input_type).to("cpu").contiguous()
mla.forward([1], [page_table], [qlen - 1], hidden_states[-1].data_ptr(), output_2.data_ptr())
print("CPU MLA Output 2: ", output_2[-1])
diff = torch.abs(output[-1] - output_2[-1])
print(f'Diff: ave:{diff.mean()}, max:{diff.max()}')
print(f"Diff: ave:{diff.mean()}, max:{diff.max()}")
assert diff.max() < 1e-1, "CPU and Torch outputs are not close enough!"
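The paged-KV bookkeeping in these MLA tests follows directly from page_size; a minimal worked sketch of the index math behind page_idx, page_offset and kv_total_len above, with illustrative numbers:

page_size = 256
kvlen, qlen = 0, 300                                          # tokens already cached / new query tokens
page_idx = [(kvlen + i) // page_size for i in range(qlen)]    # page holding each new token
page_offset = [(kvlen + i) % page_size for i in range(qlen)]  # slot within that page
num_pages = (kvlen + qlen + page_size - 1) // page_size       # kv_indptr[i+1] - kv_indptr[i]
last_page_len = kvlen + qlen - (num_pages - 1) * page_size    # valid slots in the final page
kv_total_len = (num_pages - 1) * page_size + last_page_len    # == kvlen + qlen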

View File

@@ -1,59 +1,62 @@
import logging
import os,sys
import os, sys
import time
from typing import Optional
os.environ["BLAS_NUM_THREADS"] = "1"
sys.path.insert(0, os.path.dirname(__file__) + '/../build')
import kt_kernel_ext
sys.path.insert(0, os.path.dirname(__file__) + "/../build")
from kt_kernel import kt_kernel_ext
from kt_kernel_ext.kvcache import ggml_type
import torch
from torch import inf, nn
from torch.nn import init
from torch_attention import apply_rotary_pos_emb,DeepseekV2RMSNorm,KDeepSeekV3Cache,DeepseekV3YarnRotaryEmbedding
from torch_attention import apply_rotary_pos_emb, DeepseekV2RMSNorm, KDeepSeekV3Cache, DeepseekV3YarnRotaryEmbedding
logger = logging.getLogger("reader")
from gguf.gguf_reader import GGUFReader
def load_fp32_tensor_raw(file_path):
# return torch.zeros(shape)
with open(file_path, 'rb') as f:
with open(file_path, "rb") as f:
raw_data = f.read()
tensor = torch.frombuffer(raw_data, dtype=torch.float32)
return tensor
def load_fp16_tensor(file_path, shape=None):
# return load_fp32_tensor(file_path, shape)
return load_fp32_tensor_raw(file_path)
# return torch.zeros(shape)
with open(file_path, 'rb') as f:
with open(file_path, "rb") as f:
raw_data = f.read()
tensor = torch.frombuffer(raw_data, dtype=weight_type)
tensor = tensor.view(shape) # reshape according to your target shape
return tensor
def load_fp32_tensor(file_path, shape):
# return torch.zeros(shape)
with open(file_path, 'rb') as f:
with open(file_path, "rb") as f:
raw_data = f.read()
tensor = torch.frombuffer(raw_data, dtype=torch.float32)
tensor = tensor.view(shape) # reshape according to your target shape
return tensor
def test_torch():
torch.set_grad_enabled(False)
hidden_states_to_check_decode = load_fp16_tensor('./debug_decode/query_0_tp_0_input.bin')
hidden_states_to_check_prefill = load_fp16_tensor('./debug_prefill/query_0_tp_0_input.bin')
hidden_states_to_check_decode = load_fp16_tensor("./debug_decode/query_0_tp_0_input.bin")
hidden_states_to_check_prefill = load_fp16_tensor("./debug_prefill/query_0_tp_0_input.bin")
# diff = torch.abs(hidden_states_to_check_prefill - hidden_states_to_check_decode).max()
# print("hidden_states diff -> ", diff)
q_lora_to_check_decode = load_fp16_tensor('./debug_decode/query_0_tp_0_qlora.bin')
q_lora_to_check_test_decode = load_fp16_tensor('./debug_decode/query_0_tp_0_qlora_test.bin')
q_lora_to_check_prefill = load_fp16_tensor('./debug_prefill/query_0_tp_0_qlora.bin')
q_lora_to_check_test_prefill = load_fp16_tensor('./debug_prefill/query_0_tp_0_qlora_test.bin')
q_lora_to_check_decode = load_fp16_tensor("./debug_decode/query_0_tp_0_qlora.bin")
q_lora_to_check_test_decode = load_fp16_tensor("./debug_decode/query_0_tp_0_qlora_test.bin")
q_lora_to_check_prefill = load_fp16_tensor("./debug_prefill/query_0_tp_0_qlora.bin")
q_lora_to_check_test_prefill = load_fp16_tensor("./debug_prefill/query_0_tp_0_qlora_test.bin")
# diff = torch.abs(q_lora_to_check_prefill - q_lora_to_check_decode).max()
# diff_test = torch.abs(q_lora_to_check_prefill - q_lora_to_check_decode).max()
# print("q_lora max diff -> ", diff)
@@ -63,8 +66,6 @@ def test_torch():
# print("q_lora mae -> ", mae)
# print("q_lora mae test -> ", mae_test)
# q_lora_norm = q_a_layernorm(q_lora)
# q_lora_norm_to_check = load_fp16_tensor('./debug/query_0_tp_0_qlora_norm.bin', q_lora_norm.shape)
# q_lora_norm_to_check_test = load_fp16_tensor('./debug/query_0_tp_0_qlora_norm_test.bin', q_lora_norm.shape)
@@ -94,10 +95,9 @@ def test_torch():
# )
# compressed_kv = compressed_kv.contiguous()
# compressed_kv_page_0 = compressed_kv[0:page_size, :]
compressed_kv_to_check_decode = load_fp16_tensor('./debug_decode/query_0_tp_0_page_0_kv_lora_rank')
compressed_kv_to_check_prefill = load_fp16_tensor('./debug_prefill/query_0_tp_0_page_0_kv_lora_rank')
compressed_kv_to_check_decode = load_fp16_tensor("./debug_decode/query_0_tp_0_page_0_kv_lora_rank")
compressed_kv_to_check_prefill = load_fp16_tensor("./debug_prefill/query_0_tp_0_page_0_kv_lora_rank")
# diff = torch.abs(compressed_kv_to_check_prefill - compressed_kv_to_check_decode).max()
# mae = torch.mean(torch.abs(compressed_kv_to_check_prefill - compressed_kv_to_check_decode))
# print("compressed_kv diff -> ", diff)
@@ -107,16 +107,13 @@ def test_torch():
# k_pe is [qlen, 1, qk_rope_head_dim(64)]
# compressed_kv_page_0 = compressed_kv[0:page_size, :]
compressed_kv_to_check_decode = load_fp16_tensor('./debug_decode/query_0_tp_0_page_0_kv_lora_rank_norm')
compressed_kv_to_check_prefill = load_fp16_tensor('./debug_prefill/query_0_tp_0_page_0_kv_lora_rank_norm')
compressed_kv_to_check_decode = load_fp16_tensor("./debug_decode/query_0_tp_0_page_0_kv_lora_rank_norm")
compressed_kv_to_check_prefill = load_fp16_tensor("./debug_prefill/query_0_tp_0_page_0_kv_lora_rank_norm")
# diff = torch.abs(compressed_kv_page_0 - compressed_kv_to_check).max()
# mae = torch.mean(torch.abs(compressed_kv_page_0 - compressed_kv_to_check))
# print("compressed_kv diff norm -> ", diff)
# print("compressed_kv mae norm -> ", mae)
# k_pe = k_pe.view(qlen, 1, rope_size)
# compressed_kv is [qlen, 1, kv_lora_rank(512)]
# compressed_kv = compressed_kv.view(qlen, 1, kv_lora_rank)
@@ -137,8 +134,8 @@ def test_torch():
# print("q_nope[0] mae test -> ", mae_test)
# q_pe_nope = q_pe.transpose(0,1)
q_pe_0_to_check_decode = load_fp16_tensor('./debug_decode/query_0_tp_0_q_rope')
q_pe_0_to_check_prefill = load_fp16_tensor('./debug_prefill/query_0_tp_0_q_rope')
q_pe_0_to_check_decode = load_fp16_tensor("./debug_decode/query_0_tp_0_q_rope")
q_pe_0_to_check_prefill = load_fp16_tensor("./debug_prefill/query_0_tp_0_q_rope")
# q_pe_0_to_check_decode_test = load_fp16_tensor('./debug_decode/query_0_tp_0_q_rope_test')
# q_pe_0_to_check_prefill_test = load_fp16_tensor('./debug_prefill/query_0_tp_0_q_rope_test')
@@ -185,7 +182,6 @@ def test_torch():
# diff = torch.abs(q_pe - q_new).max()
# print("q_pe diff -> ", diff)
# q_pe_0_to_check = load_fp16_tensor('./debug/query_0_tp_0_q_rope', q_pe[0].shape)
# diff = torch.abs(q_pe[0] - q_pe_0_to_check).max()
# mae = torch.mean(torch.abs(q_pe[0] - q_pe_0_to_check))
@@ -257,9 +253,8 @@ def test_torch():
# batch_compressed_kv is [kv_total_len(k_len), kv_lora_rank(512)]
# batch_k_pe is [kv_total_len(k_len), qk_rope_head_dim(64)]
k_pe_to_check_decode = load_fp16_tensor('./debug_decode/query_0_tp_0_page_0_k_rope', (256,64))
k_pe_to_check_prefill = load_fp16_tensor('./debug_prefill/query_0_tp_0_page_0_k_rope', (256,64))
k_pe_to_check_decode = load_fp16_tensor("./debug_decode/query_0_tp_0_page_0_k_rope", (256, 64))
k_pe_to_check_prefill = load_fp16_tensor("./debug_prefill/query_0_tp_0_page_0_k_rope", (256, 64))
# diff = torch.abs(k_pe_to_check_prefill - k_pe_to_check_decode).max()
# mae = torch.mean(k_pe_to_check_prefill - k_pe_to_check_decode)
# print("k_pe diff -> ", diff)
@@ -267,8 +262,8 @@ def test_torch():
# pe_weights = torch.matmul(q_pe,batch_k_pe.mT)
# kv_total_len = kv_page_nums * page_size
pe_weights_0_decode = load_fp16_tensor('./debug_decode/query_0_tp_0_pe_attention_weights', (1024,4096))
pe_weights_0_prefill = load_fp16_tensor('./debug_prefill/query_0_tp_0_pe_attention_weights', (1024,4096))
pe_weights_0_decode = load_fp16_tensor("./debug_decode/query_0_tp_0_pe_attention_weights", (1024, 4096))
pe_weights_0_prefill = load_fp16_tensor("./debug_prefill/query_0_tp_0_pe_attention_weights", (1024, 4096))
# diff = torch.abs(pe_weights[0] - pe_weights_0).max()
# print("pe_weights[0] diff -> ", diff)
@@ -290,17 +285,15 @@ def test_torch():
# attention_weights = (attention_weights + attention_masks)
# attention_weights shape is [num_heads(128), qlen, k_len]
# attention_weights = nn.functional.softmax(attention_weights,dim=-1,dtype=weight_type).to(q_pe.dtype)
attention_weights_0_decode = load_fp16_tensor('./debug_decode/query_0_tp_0_attention_weights', (1024, 4096))
attention_weights_0_prefill = load_fp16_tensor('./debug_prefill/query_0_tp_0_attention_weights', (1024, 4096))
attention_weights_0_decode = load_fp16_tensor("./debug_decode/query_0_tp_0_attention_weights", (1024, 4096))
attention_weights_0_prefill = load_fp16_tensor("./debug_prefill/query_0_tp_0_attention_weights", (1024, 4096))
# attention_weights_0 = attention_weights_0[0:qlen, 0:kv_total_len]
# diff = torch.abs(attention_weights[0] - attention_weights_0).max()
# print("attention_weights[0] diff -> ", diff)
# attn_output = torch.matmul(attention_weights, batch_compressed_kv) # [num_heads(128),qlen, lora_rank(512)]
# out_absorb shape is [num_heads(128), kv_lora_rank(512), v_head_dim(128)]
@@ -333,18 +326,15 @@ def test_torch():
# diff = torch.abs(h1_output - output_0_check).max()
# print("h1_output diff -> ", diff)
output_check_decode = load_fp16_tensor('./debug_decode/output.bin')
output_check_prefill = load_fp16_tensor('./debug_prefill/output.bin')
output_check_decode = load_fp16_tensor("./debug_decode/output.bin")
output_check_prefill = load_fp16_tensor("./debug_prefill/output.bin")
# diff = torch.abs(output - output_check).max()
# mae = torch.mean(torch.abs(output - output_check))
# print("output diff -> ", diff)
return None
torch.set_printoptions(sci_mode=False, precision=5)
# output_cpu = test_cpu_mla()
# output_cpu_quant = test_cpu_mla_quant()
@@ -361,7 +351,3 @@ output_torch = test_torch()
# print(f'Diff: ave:{diff.mean()}, max:{diff.max()}, min:{diff.min()}, relative_mean:{diff_relative_mean}, relative_max:{diff_relative.max()}, relative_min:{diff_relative.min()}')
# assert diff_relative_mean < 2e-1, "CPU and Torch outputs are not close enough!"
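Most of the commented-out checks in this file repeat the same dump-and-compare pattern; a generic helper (hypothetical, reusing the load_fp16_tensor convention above, torch assumed in scope) would collapse them to:

def compare_dumps(path_a, path_b, label="", atol=1e-2):
    a, b = load_fp16_tensor(path_a), load_fp16_tensor(path_b)
    diff = torch.abs(a - b)
    print(f"{label} diff -> max {diff.max():.5f}, mae {diff.mean():.5f}")
    return bool(diff.max() < atol)

# compare_dumps("./debug_decode/output.bin", "./debug_prefill/output.bin", label="output")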

View File

@@ -1,13 +1,14 @@
import os,sys
import os, sys
import time
from typing import Optional
sys.path.insert(0, os.path.dirname(__file__) + '/../build')
import kt_kernel_ext
sys.path.insert(0, os.path.dirname(__file__) + "/../build")
from kt_kernel import kt_kernel_ext
from kt_kernel_ext.kvcache import ggml_type
import torch
from torch import nn
from torch.nn import init
from torch_attention import apply_rotary_pos_emb,DeepseekV2RMSNorm,KDeepSeekV3Cache,DeepseekV3YarnRotaryEmbedding
from torch_attention import apply_rotary_pos_emb, DeepseekV2RMSNorm, KDeepSeekV3Cache, DeepseekV3YarnRotaryEmbedding
seed = 42 # any integer works as the seed
@@ -19,7 +20,7 @@ kvlen = 0
page_table = range(20)
bsz_tensors=torch.tensor([1])
bsz_tensors = torch.tensor([1])
page_size = 256
@@ -41,7 +42,6 @@ max_kvlen = 4096
max_position_embeddings = 163840
rope_scaling = {
"beta_fast": 32,
"beta_slow": 1,
@@ -49,17 +49,16 @@ rope_scaling = {
"mscale": 1.0,
"mscale_all_dim": 1.0,
"original_max_position_embeddings": 4096,
"type": "yarn"
"type": "yarn",
}
CPUInfer = kt_kernel_ext.CPUInfer(64)
validation_iter = 100
q_a_proj = nn.Linear(hidden_size, q_lora_rank, bias=False, dtype=torch.float16)
q_b_proj = nn.Linear(q_lora_rank, num_heads * (nope_size+rope_size) , bias=False, dtype=torch.float16)
q_b_proj = nn.Linear(q_lora_rank, num_heads * (nope_size + rope_size), bias=False, dtype=torch.float16)
kv_a_proj_with_mqa = nn.Linear(hidden_size, kv_lora_rank + rope_size, bias=False, dtype=torch.float16)
kv_b_proj = nn.Linear(kv_lora_rank, num_heads * (nope_size + nope_size), bias=False, dtype=torch.float16)
o_proj = nn.Linear(num_heads * nope_size, hidden_size, bias=False, dtype=torch.float16)
@@ -70,13 +69,11 @@ init.normal_(kv_a_proj_with_mqa.weight, mean=0.0, std=0.02)
init.normal_(kv_b_proj.weight, mean=0.0, std=0.02)
init.normal_(o_proj.weight, mean=0.0, std=0.02)
q_a_proj_weight = q_a_proj.weight.to(torch.float16).to('cpu').contiguous()
q_b_proj_weight = q_b_proj.weight.to(torch.float16).to('cpu').contiguous()
kv_a_proj_with_mqa_weight = kv_a_proj_with_mqa.weight.to('cpu').to(torch.float16).contiguous()
kv_b_proj_weight = kv_b_proj.weight.to(torch.float16).to('cpu').contiguous()
o_proj_weight = o_proj.weight.to(torch.float16).to('cpu').contiguous()
q_a_proj_weight = q_a_proj.weight.to(torch.float16).to("cpu").contiguous()
q_b_proj_weight = q_b_proj.weight.to(torch.float16).to("cpu").contiguous()
kv_a_proj_with_mqa_weight = kv_a_proj_with_mqa.weight.to("cpu").to(torch.float16).contiguous()
kv_b_proj_weight = kv_b_proj.weight.to(torch.float16).to("cpu").contiguous()
o_proj_weight = o_proj.weight.to(torch.float16).to("cpu").contiguous()
config = kt_kernel_ext.mla.MLAConfig(
@@ -114,30 +111,27 @@ config.w_o_type = ggml_type.FP16
config.pool = CPUInfer.backend_
mla = kt_kernel_ext.mla.MLA(config)
mla.load_weights()
mla.set_local_pages(pages_count)
input = torch.randn((qlen, hidden_size), dtype=torch.float16).to('cpu').contiguous()
input = torch.randn((qlen, hidden_size), dtype=torch.float16).to("cpu").contiguous()
output = torch.zeros((qlen, hidden_size), dtype=torch.float16).to('cpu').contiguous()
mla.forward([qlen],[page_table],[kvlen],input.data_ptr(),output.data_ptr())
print("CPU MLA Output: ",output)
output = torch.zeros((qlen, hidden_size), dtype=torch.float16).to("cpu").contiguous()
mla.forward([qlen], [page_table], [kvlen], input.data_ptr(), output.data_ptr())
print("CPU MLA Output: ", output)
softmax_scale = (nope_size + rope_size) ** -0.5
# the 1 is the number of compressed-kv heads
k_caches = torch.randn(1,pages_count, page_size, 1, kv_lora_rank + rope_size).to(torch.float16)
k_caches = torch.randn(1, pages_count, page_size, 1, kv_lora_rank + rope_size).to(torch.float16)
kv_cache = KDeepSeekV3Cache(page_size=page_size, kv_lora_rank=kv_lora_rank, k_caches=k_caches)
q_a_layernorm = DeepseekV2RMSNorm(q_lora_rank)
x = torch.randn(q_lora_rank, dtype=torch.float16)*100
x = torch.randn(q_lora_rank, dtype=torch.float16) * 100
print(x)
print(q_a_layernorm(x))
@@ -163,25 +157,26 @@ rotary_emb = DeepseekV3YarnRotaryEmbedding(
# last_page_len = [qlen+kvlen,...] layer_idx = 1
# position_ids = [kvlen:qlen+kvlen]
hidden_states = torch.randn(qlen, hidden_size, dtype=torch.float16)
q_indptr = torch.tensor([0,qlen]).to(torch.int32)
q_indptr = torch.tensor([0, qlen]).to(torch.int32)
kv_indptr = torch.tensor([0,(qlen+kvlen+page_size-1)//page_size]).to(torch.int32)
kv_indptr = torch.tensor([0, (qlen + kvlen + page_size - 1) // page_size]).to(torch.int32)
kv_indices = torch.tensor(range(pages_count)).to(torch.int32)
page_idx = torch.tensor([i//page_size for i in range(kvlen,kvlen+qlen)] ).to(torch.int32)
page_offset = torch.tensor( [i%page_size for i in range(kvlen, kvlen + qlen)]).to(torch.int32)
page_idx = torch.tensor([i // page_size for i in range(kvlen, kvlen + qlen)]).to(torch.int32)
page_offset = torch.tensor([i % page_size for i in range(kvlen, kvlen + qlen)]).to(torch.int32)
last_page_len = torch.tensor([(qlen+kvlen)%page_size], device=hidden_states.device)
last_page_len = torch.tensor([(qlen + kvlen) % page_size], device=hidden_states.device)
position_ids = torch.tensor(range(kvlen, kvlen + qlen)).to(torch.int32)
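# paged layout: token t (t in [kvlen, kvlen+qlen)) lives in page t // page_size at slot t % page_size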
# build the mask row by row: [qlen, kvlen+qlen]
attention_masks = torch.zeros((qlen, kvlen + qlen), dtype=torch.float16)
for i in range(qlen):
attention_masks[i, i + kvlen + 1: i + kvlen + qlen] = -65504.0
attention_masks[i, i + kvlen + 1 : i + kvlen + qlen] = -65504.0
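# -65504.0 is the most negative finite float16 value, so masked (future) positions vanish after softmax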
def torch_attn(hidden_states: torch.Tensor,
def torch_attn(
hidden_states: torch.Tensor,
kv_cache: KDeepSeekV3Cache,
position_ids: torch.Tensor,
page_idx: torch.Tensor,
@@ -193,58 +188,54 @@ def torch_attn(hidden_states: torch.Tensor,
bsz_tensors: Optional[torch.Tensor] = None,
last_page_len: Optional[torch.Tensor] = None,
layer_idx: Optional[int] = None,
):
):
global out_absorb
global q_absorb
# loop over the bsz_tensors[0] requests in the batch
final_attention_output = torch.tensor([], device=hidden_states.device)
for i in range(bsz_tensors[0]):
batch_num_tokens_tensors = q_indptr[i+1] - q_indptr[i]
batch_num_tokens_tensors = q_indptr[i + 1] - q_indptr[i]
batch_last_page_len = last_page_len[i]
# kv_total_len is kv_len, batch_compressed_kv is compressed_kv, batch_k_pe is k_pe
batch_page_idx = page_idx[q_indptr[i]:q_indptr[i+1]]
batch_page_offset = page_offset[q_indptr[i]:q_indptr[i+1]]
batch_page_idx = page_idx[q_indptr[i] : q_indptr[i + 1]]
batch_page_offset = page_offset[q_indptr[i] : q_indptr[i + 1]]
# kv_page_nums is the number of pages for the current batch
kv_page_nums = kv_indptr[i+1] - kv_indptr[i]
kv_page_nums = kv_indptr[i + 1] - kv_indptr[i]
# kv_total_len is the total length of the kv cache for the current batch (kv_len for algorithm)
kv_total_len = kv_page_nums * page_size
if batch_last_page_len is not None:
kv_total_len = kv_total_len - (page_size - batch_last_page_len)
# print(f"kv_total_len's shape {kv_total_len.shape}")
# kv_index is the index of the kv cache pages for the current batch
kv_index = kv_indices[kv_indptr[i]:kv_indptr[i+1]]
kv_index = kv_indices[kv_indptr[i] : kv_indptr[i + 1]]
# we can index [kv_index, page_offset_indices] to get the kv cache for the current batch
# from q_indptr[i] to q_indptr[i+1] is the range of the current batch
batch_hidden_states = hidden_states[q_indptr[i]:q_indptr[i+1]]
batch_position_ids = position_ids[q_indptr[i]:q_indptr[i+1]]
batch_hidden_states = hidden_states[q_indptr[i] : q_indptr[i + 1]]
batch_position_ids = position_ids[q_indptr[i] : q_indptr[i + 1]]
qlen, _ = batch_hidden_states.size()
# print("qlen -> ", qlen)
q_lora = q_a_proj(batch_hidden_states)
print('q_a_proj',q_a_proj.weight)
print('q_lora',q_lora)
print("q_a_proj", q_a_proj.weight)
print("q_lora", q_lora)
q = q_b_proj(q_a_layernorm(q_lora))
print('q_b_proj',q_b_proj.weight)
print("q_b_proj", q_b_proj.weight)
# for v3, bsz, qlen, num_heads(128), qk_head_dim(192=128(nope)+64(rope))
q = q.view(qlen, num_heads, nope_size+rope_size)
q = q.view(qlen, num_heads, nope_size + rope_size)
# q_nope is [qlen, num_heads(128), qk_nope_head_dim(128)]
# q_pe is [qlen, num_heads(128), qk_rope_head_dim(64)]
q_nope, q_pe = torch.split(
q, [nope_size, rope_size], dim=-1
)
print('q_nope',q_nope)
print('q_pe',q_pe)
q_nope, q_pe = torch.split(q, [nope_size, rope_size], dim=-1)
print("q_nope", q_nope)
print("q_pe", q_pe)
# compressed_kv is [qlen, kv_lora_rank(512) + rope(64)]
compressed_kv = kv_a_proj_with_mqa(batch_hidden_states)
# compressed_kv is [qlen, kv_lora_rank(512)], k_pe is [qlen, rope(64)]
compressed_kv, k_pe = torch.split(
compressed_kv, [kv_lora_rank, rope_size], dim=-1
)
compressed_kv, k_pe = torch.split(compressed_kv, [kv_lora_rank, rope_size], dim=-1)
compressed_kv = compressed_kv.contiguous()
compressed_kv = kv_a_layernorm(compressed_kv)
# k_pe is [qlen, 1, qk_rope_head_dim(64)]
print('compressed_kv ',compressed_kv)
print('k_pe ',k_pe)
print("compressed_kv ", compressed_kv)
print("k_pe ", k_pe)
k_pe = k_pe.view(qlen, 1, rope_size)
# compressed_kv is [qlen, 1, kv_lora_rank(512)]
compressed_kv = compressed_kv.view(qlen, 1, kv_lora_rank)
@@ -256,10 +247,17 @@ def torch_attn(hidden_states: torch.Tensor,
# q_pe is [num_heads(128), qlen, qk_rope_head_dim(64)]
q_pe.transpose_(0, 1)
if kv_cache is not None:
cache_kwargs = {"sin": sin, "cos": cos, "page_idx": batch_page_idx, "page_offset": batch_page_offset} # Specific to RoPE models
compressed_kv_with_k_pe = kv_cache.update(compressed_kv.unsqueeze(0), k_pe, layer_idx, batch_page_idx, batch_page_offset, cache_kwargs)
compressed_kv = compressed_kv_with_k_pe [:, :, :, :kv_lora_rank].view(-1, page_size, kv_lora_rank)
k_pe = compressed_kv_with_k_pe [:, :, :, kv_lora_rank:].view(-1, page_size, rope_size)
cache_kwargs = {
"sin": sin,
"cos": cos,
"page_idx": batch_page_idx,
"page_offset": batch_page_offset,
} # Specific to RoPE models
compressed_kv_with_k_pe = kv_cache.update(
compressed_kv.unsqueeze(0), k_pe, layer_idx, batch_page_idx, batch_page_offset, cache_kwargs
)
compressed_kv = compressed_kv_with_k_pe[:, :, :, :kv_lora_rank].view(-1, page_size, kv_lora_rank)
k_pe = compressed_kv_with_k_pe[:, :, :, kv_lora_rank:].view(-1, page_size, rope_size)
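# each cache slot stores the concatenation [compressed_kv | k_pe]; update() writes the new tokens at (page_idx, page_offset) and returns the layer's paged cache, which is re-split here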
# q_absorb is [num_heads(128), qk_nope_head_dim(128), kv_lora_rank(512)]
# out_absorb is [num_heads(128), kv_lora_rank(512), v_head_dim(128)] v_head_dim is also the nope dim
# q_absorb, out_absorb = get_absorbed()
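# absorbed MLA: multiplying q_nope by q_absorb moves it into the kv_lora_rank space so scores are taken directly against the compressed cache; out_absorb later projects the attended result back to v_head_dim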
@@ -297,8 +295,8 @@ def torch_attn(hidden_states: torch.Tensor,
break
# batch_compressed_kv is [kv_total_len(k_len), kv_lora_rank(512)]
# batch_k_pe is [kv_total_len(k_len), qk_rope_head_dim(64)]
pe_weights = torch.matmul(q_pe,batch_k_pe.mT)
print('pe_weights',pe_weights)
pe_weights = torch.matmul(q_pe, batch_k_pe.mT)
print("pe_weights", pe_weights)
attention_weights = (pe_weights + torch.matmul(q_nope, batch_compressed_kv.mT)) * softmax_scale
# attention_weights is [num_heads(128), qlen, k_len]
@@ -306,12 +304,12 @@ def torch_attn(hidden_states: torch.Tensor,
# attention_masks[i] is [qlen, k_len]
attention_weights = (attention_weights + attention_masks[i])
attention_weights = attention_weights + attention_masks[i]
# attention_weights shape is [num_heads(128), qlen, k_len]
attention_weights = nn.functional.softmax(attention_weights,dim=-1,dtype=torch.float16).to(q_pe.dtype)
attention_weights = nn.functional.softmax(attention_weights, dim=-1, dtype=torch.float16).to(q_pe.dtype)
attn_output = torch.matmul(attention_weights, batch_compressed_kv) # [num_heads(128),qlen, lora_rank(512)]
# out_absorb shape is [num_heads(128), kv_lora_rank(512), v_head_dim(128)]
out_absorb = out_absorb.transpose(1,2)
out_absorb = out_absorb.transpose(1, 2)
# q for qlen, n for num_heads, h for v_head_dim, v for kv_lora_rank
attn_output = torch.matmul(attn_output, out_absorb) # [num_heads(128), qlen, v_head_dim(128)]
attn_output = attn_output.transpose(0, 1) # [qlen, num_heads(128), v_head_dim(128)]
@@ -321,7 +319,6 @@ def torch_attn(hidden_states: torch.Tensor,
return final_attention_output
torch_output = torch_attn(
input,
kv_cache,
@@ -334,14 +331,6 @@ torch_output = torch_attn(
kv_indptr=kv_indptr,
bsz_tensors=bsz_tensors,
last_page_len=last_page_len,
layer_idx=0
)
print("Torch Output: ",torch_output)
layer_idx=0,
)
print("Torch Output: ", torch_output)
View File
@@ -1,6 +1,6 @@
#!/usr/bin/env python
# coding=utf-8
'''
"""
Description :
Author : chenht2022
Date : 2024-07-25 10:32:05
@@ -8,11 +8,12 @@ Version : 1.0.0
LastEditors : chenht2022
LastEditTime : 2024-08-06 10:37:28
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
'''
"""
import os, sys
import time
sys.path.append(os.path.dirname(__file__) + '/../build')
import kt_kernel_ext
sys.path.append(os.path.dirname(__file__) + "/../build")
from kt_kernel import kt_kernel_ext
import torch
hidden_size = 5120
@@ -28,9 +29,11 @@ layer_num = 10
CPUInfer = kt_kernel_ext.CPUInfer(48)
validation_iter = 100
def act_fn(x):
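# SiLU: x * sigmoid(x) == x / (1 + exp(-x)), i.e. the same as torch.nn.functional.silu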
return x / (1.0 + torch.exp(-x))
def mlp_torch(input, gate_proj, up_proj, down_proj):
gate_buf = torch.mm(input, gate_proj.t())
up_buf = torch.mm(input, up_proj.t())
@@ -38,16 +41,35 @@ def mlp_torch(input, gate_proj, up_proj, down_proj):
ret = torch.mm(intermediate, down_proj.t())
return ret
with torch.inference_mode(mode=True):
mlps = []
gate_projs = []
up_projs = []
down_projs = []
for _ in range(layer_num):
gate_proj = torch.randn((intermediate_size, hidden_size), dtype=torch.float16, device = "cuda").to("cpu").contiguous()
up_proj = torch.randn((intermediate_size, hidden_size), dtype=torch.float16, device = "cuda").to("cpu").contiguous()
down_proj = torch.randn((hidden_size, intermediate_size), dtype=torch.float16, device = "cuda").to("cpu").contiguous()
config = kt_kernel_ext.mlp.MLPConfig(hidden_size, intermediate_size, stride, group_max_len, gate_proj.data_ptr(), up_proj.data_ptr(), down_proj.data_ptr(), gate_type, up_type, down_type, hidden_type)
gate_proj = (
torch.randn((intermediate_size, hidden_size), dtype=torch.float16, device="cuda").to("cpu").contiguous()
)
up_proj = (
torch.randn((intermediate_size, hidden_size), dtype=torch.float16, device="cuda").to("cpu").contiguous()
)
down_proj = (
torch.randn((hidden_size, intermediate_size), dtype=torch.float16, device="cuda").to("cpu").contiguous()
)
config = kt_kernel_ext.mlp.MLPConfig(
hidden_size,
intermediate_size,
stride,
group_max_len,
gate_proj.data_ptr(),
up_proj.data_ptr(),
down_proj.data_ptr(),
gate_type,
up_type,
down_type,
hidden_type,
)
mlp = kt_kernel_ext.mlp.MLP(config)
gate_projs.append(gate_proj)
up_projs.append(up_proj)
@@ -61,22 +83,16 @@ with torch.inference_mode(mode=True):
output = torch.empty((qlen, hidden_size), dtype=torch.float16).contiguous()
input = input / 100
CPUInfer.submit(
mlp.forward(
qlen,
input.data_ptr(),
output.data_ptr()
)
)
CPUInfer.submit(mlp.forward(qlen, input.data_ptr(), output.data_ptr()))
CPUInfer.sync()
# print('cpuinfer output', output)
gate_proj = gate_projs[i%layer_num]
up_proj = up_projs[i%layer_num]
down_proj = down_projs[i%layer_num]
gate_proj = gate_projs[i % layer_num]
up_proj = up_projs[i % layer_num]
down_proj = down_projs[i % layer_num]
t_output = mlp_torch(input, gate_proj, up_proj, down_proj)
# print('torch output', t_output)
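# relative error: mean absolute difference normalized by the mean magnitude of the torch reference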
diff = torch.mean(torch.abs(output - t_output)) / torch.mean(torch.abs(t_output))
print('diff = ', diff)
assert(diff < 0.001)
print("diff = ", diff)
assert diff < 0.001
View File
@@ -1,6 +1,6 @@
#!/usr/bin/env python
# coding=utf-8
'''
"""
Description :
Author : chenht2022
Date : 2024-07-25 10:32:05
@@ -8,11 +8,12 @@ Version : 1.0.0
LastEditors : SkqLiao
LastEditTime : 2025-03-13 11:38:05
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
'''
"""
import os, sys
import time
sys.path.insert(0, os.path.dirname(__file__) + '/../build')
import kt_kernel_ext
sys.path.insert(0, os.path.dirname(__file__) + "/../build")
from kt_kernel import kt_kernel_ext
import torch
from tqdm import tqdm
from kt_kernel_ext.kvcache import ggml_type
@@ -20,7 +21,7 @@ from kt_kernel_ext.kvcache import ggml_type
torch.manual_seed(0)
expert_num = 8
hidden_size = 2048 #7168
hidden_size = 2048 # 7168
intermediate_size = 2048
stride = 32
group_min_len = 10
@@ -39,9 +40,11 @@ layer_num = 1
CPUInfer = kt_kernel_ext.CPUInfer(64)
validation_iter = 10
def act_fn(x):
return x / (1.0 + torch.exp(-x))
def mlp_torch(input, gate_proj, up_proj, down_proj):
gate_buf = torch.mm(input, gate_proj.t())
up_buf = torch.mm(input, up_proj.t())
@@ -49,6 +52,7 @@ def mlp_torch(input, gate_proj, up_proj, down_proj):
ret = torch.mm(intermediate, down_proj.t())
return ret
def moe_torch(input, expert_ids, weights, gate_proj, up_proj, down_proj):
cnts = expert_ids.new_zeros((expert_ids.shape[0], expert_num))
cnts.scatter_(1, expert_ids, 1)
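# cnts[t, e] = 1 for every expert e routed to token t (one-hot over the num_experts_per_tok picks)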
@@ -85,10 +89,12 @@ def to_cpuinfer_tensor(tensor, type):
size = torch.prod(torch.tensor(tensor.shape, dtype=torch.int32)).item()
return kt_kernel_ext.utils.from_float(tensor.data_ptr(), size, type)
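# inverse of to_cpuinfer_tensor: convert `size` elements of the given ggml type back to float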
def from_cpuinfer_tensor(tensor, size, type):
return kt_kernel_ext.utils.to_float(tensor.data_ptr(), size, type)
qlens = [1,64] #[64, 512, 2048, 8192, 16384]
qlens = [1, 64] # [64, 512, 2048, 8192, 16384]
# gate_types = [ggml_type.FP32, ggml_type.FP16, ggml_type.Q8_0, ggml_type.Q6_K, ggml_type.Q5_K, ggml_type.Q4_K, ggml_type.Q3_K]
# up_types = [ggml_type.FP32, ggml_type.FP16, ggml_type.Q8_0, ggml_type.Q6_K, ggml_type.Q5_K, ggml_type.Q4_K, ggml_type.Q3_K]
# down_types = [ggml_type.FP32, ggml_type.FP16, ggml_type.Q8_0, ggml_type.Q6_K, ggml_type.Q6_K, ggml_type.Q6_K, ggml_type.Q5_K]
@@ -96,8 +102,8 @@ gate_types = [ggml_type.Q4_K]
up_types = [ggml_type.Q4_K]
down_types = [ggml_type.Q6_K]
hidden_type = ggml_type.BF16
print(f'Parameters: expert_num: {expert_num} hidden_size: {hidden_size} intermediate_size: {intermediate_size}')
print(f'group_max_len: ', group_max_len)
print(f"Parameters: expert_num: {expert_num} hidden_size: {hidden_size} intermediate_size: {intermediate_size}")
print(f"group_max_len: ", group_max_len)
for qlen in qlens:
for gate_type, up_type, down_type in zip(gate_types, up_types, down_types):
@@ -106,13 +112,25 @@ for qlen in qlens:
gate_projs = []
up_projs = []
down_projs = []
print('Preparing data...')
print("Preparing data...")
converted_tensors = []
for _ in range(layer_num):
size = expert_num * intermediate_size * hidden_size
gate_proj = torch.randn((expert_num, intermediate_size, hidden_size), dtype=torch.float32, device = "cuda").to("cpu").contiguous()
up_proj = torch.randn((expert_num, intermediate_size, hidden_size), dtype=torch.float32, device = "cuda").to("cpu").contiguous()
down_proj = torch.randn((expert_num, hidden_size, intermediate_size), dtype=torch.float32, device = "cuda").to("cpu").contiguous()
gate_proj = (
torch.randn((expert_num, intermediate_size, hidden_size), dtype=torch.float32, device="cuda")
.to("cpu")
.contiguous()
)
up_proj = (
torch.randn((expert_num, intermediate_size, hidden_size), dtype=torch.float32, device="cuda")
.to("cpu")
.contiguous()
)
down_proj = (
torch.randn((expert_num, hidden_size, intermediate_size), dtype=torch.float32, device="cuda")
.to("cpu")
.contiguous()
)
gate_tensor = to_cpuinfer_tensor(gate_proj, gate_type)
up_tensor = to_cpuinfer_tensor(up_proj, up_type)
@@ -131,7 +149,6 @@ for qlen in qlens:
config.down_type = down_type
config.hidden_type = hidden_type
moe = kt_kernel_ext.moe.MOE(config)
gate_projs.append(gate_proj)
up_projs.append(up_proj)
@@ -140,19 +157,21 @@ for qlen in qlens:
CPUInfer.sync()
moes.append(moe)
converted_tensors.append((gate_tensor, up_tensor, down_tensor))
print('Finished initialization!')
print("Finished initialization!")
CPUInfer.submit(moes[0].warm_up_task())
CPUInfer.sync()
print('Warm up finished!')
print("Warm up finished!")
# validation
progress_bar = tqdm(range(validation_iter), desc="Starting")
total_diff = 0
for i in tqdm(progress_bar):
progress_bar.set_description('Round: {}/{}'.format(i + 1, validation_iter))
expert_ids = torch.stack([torch.randperm(expert_num)[:num_experts_per_tok] for _ in range(qlen)]).contiguous()
progress_bar.set_description("Round: {}/{}".format(i + 1, validation_iter))
expert_ids = torch.stack(
[torch.randperm(expert_num)[:num_experts_per_tok] for _ in range(qlen)]
).contiguous()
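# each token routes to num_experts_per_tok distinct experts, drawn from a fresh random permutation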
weights = torch.rand((qlen, num_experts_per_tok), dtype=torch.float32).contiguous()
input_proj = torch.randn((qlen, hidden_size), dtype=torch.float32).contiguous() / 100
output_proj = torch.empty((qlen, hidden_size), dtype=torch.float32).contiguous()
@@ -175,15 +194,17 @@ for qlen in qlens:
CPUInfer.sync()
cpu_output = from_cpuinfer_tensor(output_tensor, qlen * hidden_size, hidden_type)
gate_proj = gate_projs[i%layer_num]
up_proj = up_projs[i%layer_num]
down_proj = down_projs[i%layer_num]
gate_proj = gate_projs[i % layer_num]
up_proj = up_projs[i % layer_num]
down_proj = down_projs[i % layer_num]
t_output = moe_torch(input_proj, expert_ids, weights, gate_proj, up_proj, down_proj)
print('cpuinfer output', cpu_output)
print('torch output', t_output)
diff = torch.mean(torch.abs(cpu_output.flatten() - t_output.flatten())) / torch.mean(torch.abs(t_output.flatten()))
print("cpuinfer output", cpu_output)
print("torch output", t_output)
diff = torch.mean(torch.abs(cpu_output.flatten() - t_output.flatten())) / torch.mean(
torch.abs(t_output.flatten())
)
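# loose tolerance here, presumably because gate/up/down use low-bit quantization (Q4_K/Q6_K) against an fp32 torch reference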
assert diff < 0.5
total_diff += diff
print(f'gate_type: {gate_type}, up_type: {up_type}, down_type: {down_type}')
print(f'Average diff: {total_diff / validation_iter:.4f}')
print(f"gate_type: {gate_type}, up_type: {up_type}, down_type: {down_type}")
print(f"Average diff: {total_diff / validation_iter:.4f}")
View File
@@ -4,7 +4,7 @@ sys.path.insert(0, os.path.dirname(__file__) + "/../build")
print("sys.path:", sys.path)
import torch
import kt_kernel_ext
from kt_kernel import kt_kernel_ext
expert_num = 256
hidden_size = 7168
View File
@@ -15,7 +15,7 @@ import time
sys.path.insert(0, os.path.dirname(__file__) + "/../build")
os.environ["BLAS_NUM_THREADS"] = "1"
import torch
import kt_kernel_ext
from kt_kernel import kt_kernel_ext
expert_num = 16
View File
@@ -14,7 +14,7 @@ import time
sys.path.insert(0, os.path.dirname(__file__) + "/../build")
os.environ["BLAS_NUM_THREADS"] = "1"
import kt_kernel_ext
from kt_kernel import kt_kernel_ext
import torch
expert_num = 16
View File
@@ -15,7 +15,7 @@ from abc import ABC, abstractmethod
import os
import ctypes
import kt_kernel_ext
from kt_kernel import kt_kernel_ext
class KExpertsCPUBuffer: