[fix](test): fix import kt-kernel (#1728)

ErvinXie
2025-12-17 19:46:32 +08:00
committed by GitHub
parent 6fc4080a7d
commit a8667ddb58
33 changed files with 1063 additions and 1151 deletions
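The change repeated across these files is the same one-line import fix: instead of importing the compiled extension module directly off sys.path, the scripts now import it through the kt_kernel package. A minimal sketch of the before/after pattern, as it appears throughout the hunks below:

# before: relies on sys.path pointing at the local build directory
# import kt_kernel_ext
# after: resolve the compiled extension through the kt_kernel package
from kt_kernel import kt_kernel_ext

The sys.path.append(...)/sys.path.insert(...) lines themselves are left in place; only the import statement changes, plus black-style reformatting throughout.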

View File

@@ -1,19 +1,19 @@
#!/usr/bin/env python
# coding=utf-8
"""
Description :
Description :
Author : Jianwei Dong
Date : 2024-08-28 10:32:05
Version : 1.0.0
LastEditors : Jianwei Dong
LastEditors : Jianwei Dong
LastEditTime : 2024-08-28 10:32:05
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
"""
import os, sys
import time
sys.path.append(os.path.dirname(__file__) + "/../build")
import kt_kernel_ext
from kt_kernel import kt_kernel_ext
import torch
layer_num = 10
@@ -61,11 +61,7 @@ def bench_linear(cache_seqlen: int):
max_thread_num,
)
local_kvcache = kt_kernel_ext.kvcache.KVCache(config)
block_table = (
torch.arange(max_block_num, dtype=torch.int32, device="cpu")
.contiguous()
.view(1, -1)
)
block_table = torch.arange(max_block_num, dtype=torch.int32, device="cpu").contiguous().view(1, -1)
for layer_idx in range(layer_num):
k_cache = torch.randn(
@@ -93,17 +89,11 @@ def bench_linear(cache_seqlen: int):
)
CPUInfer.sync()
input = torch.randn(
(1, 1, q_head_num, head_dim), dtype=torch.float16, device="cpu"
).contiguous()
output = torch.empty(
(1, 1, q_head_num, head_dim), dtype=torch.float16, device="cpu"
).contiguous()
input = torch.randn((1, 1, q_head_num, head_dim), dtype=torch.float16, device="cpu").contiguous()
output = torch.empty((1, 1, q_head_num, head_dim), dtype=torch.float16, device="cpu").contiguous()
# attn_lse: (bsz, q_len, q_head_num)
attn_lse = torch.empty(
(1, 1, q_head_num), dtype=torch.float32, device="cpu"
).contiguous()
attn_lse = torch.empty((1, 1, q_head_num), dtype=torch.float32, device="cpu").contiguous()
input = input / 100
# warm up
@@ -156,16 +146,7 @@ def bench_linear(cache_seqlen: int):
print("Time(us) per iteration: ", total_time / test_iter * 1000000)
print(
"Bandwidth: ",
cache_seqlen
* kv_head_num
* head_dim
* 2
* 2
* test_iter
/ total_time
/ 1000
/ 1000
/ 1000,
cache_seqlen * kv_head_num * head_dim * 2 * 2 * test_iter / total_time / 1000 / 1000 / 1000,
"GB/s",
)
print("")

View File

@@ -1,19 +1,19 @@
#!/usr/bin/env python
# coding=utf-8
"""
Description :
Description :
Author : Jianwei Dong
Date : 2024-08-28 10:32:05
Version : 1.0.0
LastEditors : Jianwei Dong
LastEditors : Jianwei Dong
LastEditTime : 2024-08-28 10:32:05
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
"""
import os, sys
import time
sys.path.append(os.path.dirname(__file__) + "/../build")
import kt_kernel_ext
from kt_kernel import kt_kernel_ext
import torch
layer_num = 10
@@ -45,9 +45,7 @@ def bench_linear(cache_seqlen: int, device):
kvcaches.append((k_cache, v_cache))
input = torch.randn(
(1, q_head_num, 1, head_dim), dtype=torch.float16, device=device
).contiguous()
input = torch.randn((1, q_head_num, 1, head_dim), dtype=torch.float16, device=device).contiguous()
input = input / 100
# warm up
@@ -70,16 +68,7 @@ def bench_linear(cache_seqlen: int, device):
print("Time(us) per iteration: ", total_time / test_iter * 1000000)
print(
"Bandwidth: ",
cache_seqlen
* q_head_num
* head_dim
* 2
* 2
* test_iter
/ total_time
/ 1000
/ 1000
/ 1000,
cache_seqlen * q_head_num * head_dim * 2 * 2 * test_iter / total_time / 1000 / 1000 / 1000,
"GB/s",
)
print("")

View File

@@ -15,7 +15,7 @@ from tqdm import tqdm
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "build"))
import kt_kernel_ext
from kt_kernel import kt_kernel_ext
import torch
# Benchmark parameters (single MoE, no layer loop)
@@ -29,9 +29,7 @@ warm_up_iter = 1000
test_iter = 5000
k_group_size = 32
physical_to_logical_map = (
torch.tensor(data=range(expert_num), device="cpu", dtype=torch.int64).contiguous()
)
physical_to_logical_map = torch.tensor(data=range(expert_num), device="cpu", dtype=torch.int64).contiguous()
worker_config = kt_kernel_ext.WorkerPoolConfig()
worker_config.subpool_count = 2
@@ -43,24 +41,12 @@ CPUInfer = kt_kernel_ext.CPUInfer(worker_config)
def get_git_commit():
result = {}
try:
commit = (
subprocess.check_output(["git", "rev-parse", "HEAD"])
.decode("utf-8")
.strip()
)
commit_msg = (
subprocess.check_output(["git", "log", "-1", "--pretty=%B"])
.decode("utf-8")
.strip()
)
commit = subprocess.check_output(["git", "rev-parse", "HEAD"]).decode("utf-8").strip()
commit_msg = subprocess.check_output(["git", "log", "-1", "--pretty=%B"]).decode("utf-8").strip()
result["commit"] = commit
result["commit_message"] = commit_msg
dirty_output = (
subprocess.check_output(["git", "status", "--porcelain"])
.decode("utf-8")
.strip()
)
dirty_output = subprocess.check_output(["git", "status", "--porcelain"]).decode("utf-8").strip()
if dirty_output:
result["dirty"] = True
result["dirty_files"] = dirty_output.splitlines()
@@ -132,9 +118,7 @@ def record_results(result, filename=json_path):
f.write(json.dumps(result) + "\n")
def pack_to_int32(
value: torch.Tensor, num_bits: int, packed_dim: int = 1
) -> torch.Tensor:
def pack_to_int32(value: torch.Tensor, num_bits: int, packed_dim: int = 1) -> torch.Tensor:
if value.dtype is not torch.int8:
raise ValueError("Tensor must be torch.int8 before packing")
if not (1 <= num_bits <= 8):
@@ -181,9 +165,7 @@ def quantize_k2_tensor(weights: torch.Tensor, group_size: int):
weights_f32 = weights.to(torch.float32)
e, rows, cols = weights_f32.shape
if cols % group_size != 0 or cols % 2 != 0:
raise ValueError(
f"cols ({cols}) must be divisible by group_size ({group_size}) and 2"
)
raise ValueError(f"cols ({cols}) must be divisible by group_size ({group_size}) and 2")
reshaped = weights_f32.view(e, rows, cols // group_size, group_size)
max_abs = reshaped.abs().amax(dim=-1, keepdim=True).clamp(min=1e-8)
@@ -191,9 +173,7 @@ def quantize_k2_tensor(weights: torch.Tensor, group_size: int):
q = torch.round(reshaped / scales.unsqueeze(-1)).clamp(-8, 7).to(torch.int8)
q = q.view(e, rows, cols)
packed = pack_tensor_per_row(q, num_bits=4).view(e, rows, cols // 8).contiguous()
scales = scales.to(torch.bfloat16).contiguous().view(
e, rows, cols // group_size
).contiguous()
scales = scales.to(torch.bfloat16).contiguous().view(e, rows, cols // group_size).contiguous()
return packed, scales
@@ -233,9 +213,7 @@ def bench_k2_moe():
bytes_per_elem = 0.5 + 2.0 / k_group_size
quant_data = build_quantized_layer_weights()
config = kt_kernel_ext.moe.MOEConfig(
expert_num, num_experts_per_tok, hidden_size, intermediate_size, 0
)
config = kt_kernel_ext.moe.MOEConfig(expert_num, num_experts_per_tok, hidden_size, intermediate_size, 0)
config.max_len = max_len
config.quant_config.bits = 4
config.quant_config.group_size = k_group_size
@@ -261,12 +239,8 @@ def bench_k2_moe():
.reshape(gen_iter, qlen * num_experts_per_tok)
.contiguous()
)
weights = torch.rand(
(gen_iter, qlen, num_experts_per_tok), dtype=torch.float32, device="cpu"
).contiguous()
input_tensor = torch.randn(
(qlen, hidden_size), dtype=torch.bfloat16, device="cpu"
).contiguous()
weights = torch.rand((gen_iter, qlen, num_experts_per_tok), dtype=torch.float32, device="cpu").contiguous()
input_tensor = torch.randn((qlen, hidden_size), dtype=torch.bfloat16, device="cpu").contiguous()
output_tensor = torch.empty_like(input_tensor)
bsz_tensor = torch.tensor([qlen], device="cpu")
@@ -313,17 +287,7 @@ def bench_k2_moe():
/ total_time
/ 1e9
)
flops = (
hidden_size
* intermediate_size
* qlen
* 3
* num_experts_per_tok
* 2
* test_iter
/ total_time
/ 1e12
)
flops = hidden_size * intermediate_size * qlen * 3 * num_experts_per_tok * 2 * test_iter / total_time / 1e12
print("Quant mode: int4_k2")
print("Time(s): ", total_time)

View File

@@ -14,7 +14,7 @@ from tqdm import tqdm
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "build"))
import kt_kernel_ext
from kt_kernel import kt_kernel_ext
import torch
# Benchmark parameters (single MoE, mirror examples/test_k2_write_buffer.py)
@@ -39,20 +39,12 @@ CPUInfer = kt_kernel_ext.CPUInfer(96)
def get_git_commit():
result = {}
try:
commit = (
subprocess.check_output(["git", "rev-parse", "HEAD"]).decode("utf-8").strip()
)
commit_msg = (
subprocess.check_output(["git", "log", "-1", "--pretty=%B"])
.decode("utf-8")
.strip()
)
commit = subprocess.check_output(["git", "rev-parse", "HEAD"]).decode("utf-8").strip()
commit_msg = subprocess.check_output(["git", "log", "-1", "--pretty=%B"]).decode("utf-8").strip()
result["commit"] = commit
result["commit_message"] = commit_msg
dirty_output = (
subprocess.check_output(["git", "status", "--porcelain"]).decode("utf-8").strip()
)
dirty_output = subprocess.check_output(["git", "status", "--porcelain"]).decode("utf-8").strip()
if dirty_output:
result["dirty"] = True
result["dirty_files"] = dirty_output.splitlines()
@@ -160,9 +152,7 @@ def build_moe():
per_mat_scale_elems,
) = allocate_weights()
config = kt_kernel_ext.moe.MOEConfig(
expert_num, num_experts_per_tok, hidden_size, intermediate_size
)
config = kt_kernel_ext.moe.MOEConfig(expert_num, num_experts_per_tok, hidden_size, intermediate_size)
config.max_len = max_len
config.quant_config.bits = 4
config.quant_config.group_size = group_size
@@ -186,18 +176,10 @@ def build_moe():
total_weight_bytes_per_tp = gpu_experts_num * weight_bytes_per_expert_per_tp
total_scale_elems_per_tp = gpu_experts_num * scale_elems_per_expert_per_tp
w13_weight_bufs = [
torch.empty(2 * total_weight_bytes_per_tp, dtype=torch.uint8) for _ in range(gpu_tp_count)
]
w13_scale_bufs = [
torch.empty(2 * total_scale_elems_per_tp, dtype=torch.bfloat16) for _ in range(gpu_tp_count)
]
w2_weight_bufs = [
torch.empty(total_weight_bytes_per_tp, dtype=torch.uint8) for _ in range(gpu_tp_count)
]
w2_scale_bufs = [
torch.empty(total_scale_elems_per_tp, dtype=torch.bfloat16) for _ in range(gpu_tp_count)
]
w13_weight_bufs = [torch.empty(2 * total_weight_bytes_per_tp, dtype=torch.uint8) for _ in range(gpu_tp_count)]
w13_scale_bufs = [torch.empty(2 * total_scale_elems_per_tp, dtype=torch.bfloat16) for _ in range(gpu_tp_count)]
w2_weight_bufs = [torch.empty(total_weight_bytes_per_tp, dtype=torch.uint8) for _ in range(gpu_tp_count)]
w2_scale_bufs = [torch.empty(total_scale_elems_per_tp, dtype=torch.bfloat16) for _ in range(gpu_tp_count)]
buffer_ptrs = {
"w13_weight_ptrs": [buf.data_ptr() for buf in w13_weight_bufs],
@@ -248,7 +230,7 @@ def bench_write_buffer():
)
)
CPUInfer.sync()
total_time = 0
for _ in tqdm(range(test_iter), desc="Testing"):
start = time.perf_counter()
@@ -265,8 +247,6 @@ def bench_write_buffer():
time.sleep(0.6)
print(end - start)
time_per_iter_us = total_time / test_iter * 1e6
bandwidth_gbs = bytes_per_call * test_iter / total_time / 1e9

View File

@@ -1,18 +1,19 @@
#!/usr/bin/env python
# coding=utf-8
'''
Description :
"""
Description :
Author : chenht2022
Date : 2024-07-25 10:31:59
Version : 1.0.0
LastEditors : chenht2022
LastEditTime : 2024-08-06 10:35:35
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
'''
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
"""
import os, sys
import time
sys.path.append(os.path.dirname(__file__) + '/../build')
import kt_kernel_ext
sys.path.append(os.path.dirname(__file__) + "/../build")
from kt_kernel import kt_kernel_ext
import torch
input_size = 16384
@@ -25,65 +26,64 @@ CPUInfer = kt_kernel_ext.CPUInfer(64)
warm_up_iter = 1000
test_iter = 10000
def bench_linear(quant_mode: str):
with torch.inference_mode(mode=True):
hidden_type = 30  # ggml_type::GGML_TYPE_BF16
if quant_mode == "fp32":
proj_type = 0  # ggml_type::GGML_TYPE_F32
bytes_per_elem = 4.000000
elif quant_mode == "fp16":
proj_type = 1  # ggml_type::GGML_TYPE_F16
bytes_per_elem = 2.000000
elif quant_mode == "bf16":
proj_type = 30  # ggml_type::GGML_TYPE_BF16
bytes_per_elem = 2.000000
elif quant_mode == "q8_0":
proj_type = 8  # ggml_type::GGML_TYPE_Q8_0
bytes_per_elem = 1.062500
elif quant_mode == "q6_k":
proj_type = 14  # ggml_type::GGML_TYPE_Q6_K
bytes_per_elem = 0.820312
elif quant_mode == "q5_k_m":
proj_type = 13  # ggml_type::GGML_TYPE_Q5_K
bytes_per_elem = 0.687500
elif quant_mode == "q4_k_m":
proj_type = 12  # ggml_type::GGML_TYPE_Q4_K
bytes_per_elem = 0.562500
elif quant_mode == "q3_k_m":
proj_type = 11  # ggml_type::GGML_TYPE_Q3_K
bytes_per_elem = 0.429688
elif quant_mode == "q2_k":
proj_type = 10  # ggml_type::GGML_TYPE_Q2_K
bytes_per_elem = 0.328125
elif quant_mode == "iq3_xs":
proj_type = 21  # ggml_type::GGML_TYPE_IQ3_S
bytes_per_elem = 0.429688
elif quant_mode == "iq2_xxs":
proj_type = 16  # ggml_type::GGML_TYPE_IQ2_XXS
bytes_per_elem = 0.257812
else:
assert(False)
assert False
linears = []
projs = []
for _ in range(layer_num):
proj = torch.randn((output_size, input_size), dtype=torch.float32, device = "cuda").to("cpu").contiguous()
config = kt_kernel_ext.linear.LinearConfig(input_size, output_size, stride, group_max_len, proj.data_ptr(), proj_type, hidden_type)
proj = torch.randn((output_size, input_size), dtype=torch.float32, device="cuda").to("cpu").contiguous()
config = kt_kernel_ext.linear.LinearConfig(
input_size, output_size, stride, group_max_len, proj.data_ptr(), proj_type, hidden_type
)
linear = kt_kernel_ext.linear.Linear(config)
projs.append(proj)
linears.append(linear)
input = torch.randn((layer_num, qlen, input_size), dtype=torch.bfloat16, device = "cuda").to("cpu").contiguous()
output = torch.empty((layer_num, qlen, output_size), dtype=torch.bfloat16, device = "cuda").to("cpu").contiguous()
input = torch.randn((layer_num, qlen, input_size), dtype=torch.bfloat16, device="cuda").to("cpu").contiguous()
output = torch.empty((layer_num, qlen, output_size), dtype=torch.bfloat16, device="cuda").to("cpu").contiguous()
# warm up
for i in range(warm_up_iter):
CPUInfer.submit(
linears[i % layer_num].forward(
qlen,
input[i % layer_num].data_ptr(),
output[i % layer_num].data_ptr()
)
linears[i % layer_num].forward(qlen, input[i % layer_num].data_ptr(), output[i % layer_num].data_ptr())
)
CPUInfer.sync()
@@ -91,21 +91,22 @@ def bench_linear(quant_mode: str):
start = time.perf_counter()
for i in range(test_iter):
CPUInfer.submit(
linears[i % layer_num].forward(
qlen,
input[i % layer_num].data_ptr(),
output[i % layer_num].data_ptr()
)
linears[i % layer_num].forward(qlen, input[i % layer_num].data_ptr(), output[i % layer_num].data_ptr())
)
CPUInfer.sync()
end = time.perf_counter()
total_time = end - start
print('Quant mode: ', quant_mode)
print('Time(s): ', total_time)
print('Iteration: ', test_iter)
print('Time(us) per iteration: ', total_time / test_iter * 1000000)
print('Bandwidth: ', input_size * output_size * bytes_per_elem * test_iter / total_time / 1000 / 1000 / 1000, 'GB/s')
print('')
print("Quant mode: ", quant_mode)
print("Time(s): ", total_time)
print("Iteration: ", test_iter)
print("Time(us) per iteration: ", total_time / test_iter * 1000000)
print(
"Bandwidth: ",
input_size * output_size * bytes_per_elem * test_iter / total_time / 1000 / 1000 / 1000,
"GB/s",
)
print("")
bench_linear("fp32")
bench_linear("fp16")

View File

@@ -3,9 +3,10 @@ import time
import subprocess
import platform
import json
os.environ["BLAS_NUM_THREADS"] = "1"
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'build'))
import kt_kernel_ext
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "build"))
from kt_kernel import kt_kernel_ext
from kt_kernel_ext.kvcache import ggml_type
import torch
from torch import inf, nn
@@ -31,9 +32,9 @@ layer_num = 10
rope_theta = 10000
max_qlen = qlen+kvlen
max_qlen = qlen + kvlen
max_kvlen = 4096
max_position_embeddings = 163840
rope_scaling = {
"beta_fast": 32,
@@ -42,7 +43,7 @@ rope_scaling = {
"mscale": 1.0,
"mscale_all_dim": 1.0,
"original_max_position_embeddings": 4096,
"type": "yarn"
"type": "yarn",
}
CPUINFER_PARAM = 304
@@ -54,13 +55,12 @@ warm_up_iter = 20
test_iter = 100
# Gather script info, used to name the results file
script_path = os.path.abspath(__file__)
script_dir = os.path.dirname(script_path)
script_name = os.path.splitext(os.path.basename(script_path))[0]
json_path = os.path.join(script_dir, "bench_results "+ ".jsonl")
json_path = os.path.join(script_dir, "bench_results " + ".jsonl")
def get_git_commit():
"""
@@ -100,9 +100,9 @@ def get_system_info():
# Get the CPU model (Linux only)
cpu_model = None
if os.path.exists('/proc/cpuinfo'):
if os.path.exists("/proc/cpuinfo"):
try:
with open('/proc/cpuinfo', 'r') as f:
with open("/proc/cpuinfo", "r") as f:
for line in f:
if "model name" in line:
cpu_model = line.split(":", 1)[1].strip()
@@ -113,9 +113,9 @@ def get_system_info():
# Get total memory in GB (Linux only)
mem_total_gb = None
if os.path.exists('/proc/meminfo'):
if os.path.exists("/proc/meminfo"):
try:
with open('/proc/meminfo', 'r') as f:
with open("/proc/meminfo", "r") as f:
for line in f:
if "MemTotal" in line:
mem_kb = float(line.split(":", 1)[1].split()[0])
@@ -149,6 +149,7 @@ def record_results(result, filename=json_path):
with open(filename, "a") as f:
f.write(json.dumps(result) + "\n")
def bench_mla(quant_mode: str):
"""
Benchmark the performance of the MLA model
@@ -171,22 +172,22 @@ def bench_mla(quant_mode: str):
w_o_type = 1
bytes_per_elem = 2.000000
elif quant_mode == "q4_k_m":
q_a_proj_type = 12  # ggml_type::GGML_TYPE_Q4_K
q_b_proj_type = 12
kv_a_proj_with_mqa_type = 12  # ggml_type::GGML_TYPE_Q4_K
kv_b_proj_type = 12
w_o_type = 12
bytes_per_elem = 0.5625
else:
raise ValueError("不支持的量化模式")
# Build inputs for each layer's MLA model
mlas = []
for i in tqdm(range(layer_num)):
q_a_proj = nn.Linear(hidden_size, q_lora_rank, bias=False, dtype=torch.float16)
q_b_proj = nn.Linear(q_lora_rank, num_heads * (nope_size+rope_size) , bias=False, dtype=torch.float16)
q_b_proj = nn.Linear(q_lora_rank, num_heads * (nope_size + rope_size), bias=False, dtype=torch.float16)
kv_a_proj_with_mqa = nn.Linear(hidden_size, kv_lora_rank + rope_size, bias=False, dtype=torch.float16)
kv_b_proj = nn.Linear( num_heads * (nope_size + nope_size),kv_lora_rank, bias=False, dtype=torch.float16)
kv_b_proj = nn.Linear(num_heads * (nope_size + nope_size), kv_lora_rank, bias=False, dtype=torch.float16)
o_proj = nn.Linear(num_heads * nope_size, hidden_size, bias=False, dtype=torch.float16)
init.normal_(q_a_proj.weight, mean=0.0, std=0.02)
@@ -194,11 +195,11 @@ def bench_mla(quant_mode: str):
init.normal_(kv_a_proj_with_mqa.weight, mean=0.0, std=0.02)
init.normal_(kv_b_proj.weight, mean=0.0, std=0.02)
init.normal_(o_proj.weight, mean=0.0, std=0.02)
q_a_proj_weight = q_a_proj.weight.to(torch.float16).to('cpu').contiguous()
q_b_proj_weight = q_b_proj.weight.to(torch.float16).to('cpu').contiguous()
kv_a_proj_with_mqa_weight = kv_a_proj_with_mqa.weight.to('cpu').to(torch.float16).contiguous()
kv_b_proj_weight = kv_b_proj.weight.to(torch.float16).to('cpu').contiguous()
o_proj_weight = o_proj.weight.to(torch.float16).to('cpu').contiguous()
q_a_proj_weight = q_a_proj.weight.to(torch.float16).to("cpu").contiguous()
q_b_proj_weight = q_b_proj.weight.to(torch.float16).to("cpu").contiguous()
kv_a_proj_with_mqa_weight = kv_a_proj_with_mqa.weight.to("cpu").to(torch.float16).contiguous()
kv_b_proj_weight = kv_b_proj.weight.to(torch.float16).to("cpu").contiguous()
o_proj_weight = o_proj.weight.to(torch.float16).to("cpu").contiguous()
config = kt_kernel_ext.mla.MLAConfig(
hidden_size,
@@ -210,7 +211,7 @@ def bench_mla(quant_mode: str):
)
config.max_qlen = max_qlen
config.max_kvlen = max_kvlen
config.max_position_embeddings = max_position_embeddings
config.rope_scaling_factor = rope_scaling["factor"]
config.rope_theta = rope_theta
config.rope_scaling_beta_fast = rope_scaling["beta_fast"]
@@ -231,64 +232,85 @@ def bench_mla(quant_mode: str):
config.kv_b_proj_type = ggml_type.FP16
config.w_o_type = ggml_type.FP16
config.pool = CPUInfer.backend_
mla = kt_kernel_ext.mla.MLA(config)
mla.load_weights()
mla.set_local_pages(pages_count)
mlas.append(mla)
print('Generating data...')
input_tensor = torch.randn((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device="cpu").to("cpu").contiguous()
output_tensor = torch.empty((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device="cpu").to("cpu").contiguous()
print('Warming up...')
print("Generating data...")
input_tensor = (
torch.randn((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device="cpu").to("cpu").contiguous()
)
output_tensor = (
torch.empty((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device="cpu").to("cpu").contiguous()
)
print("Warming up...")
for i in tqdm(range(warm_up_iter)):
mlas[i%layer_num].forward([qlen],[page_table],[kvlen],
input_tensor[i%layer_num].data_ptr(),output_tensor[i%layer_num].data_ptr())
mlas[i % layer_num].forward(
[qlen],
[page_table],
[kvlen],
input_tensor[i % layer_num].data_ptr(),
output_tensor[i % layer_num].data_ptr(),
)
print('Start testing...')
print("Start testing...")
start = time.perf_counter()
for i in tqdm(range(test_iter)):
mlas[i%layer_num].forward([qlen],[page_table],[kvlen],
input_tensor[i%layer_num].data_ptr(),output_tensor[i%layer_num].data_ptr())
mlas[i % layer_num].forward(
[qlen],
[page_table],
[kvlen],
input_tensor[i % layer_num].data_ptr(),
output_tensor[i % layer_num].data_ptr(),
)
end = time.perf_counter()
total_time = end - start
time_per_iter_us = (total_time * 1e6) / test_iter
bandwidth = bytes_per_elem * (q_lora_rank * hidden_size
+ (kv_lora_rank+rope_size) * hidden_size
+ (nope_size+rope_size) * q_lora_rank * num_heads
+ (nope_size+nope_size)*kv_lora_rank * num_heads
+ hidden_size * nope_size * num_heads
+ hidden_size * qlen) * test_iter / (total_time * 1e9)
flops = 2*(
q_lora_rank*hidden_size*qlen
+ kv_lora_rank * hidden_size * qlen
+num_heads* (nope_size+rope_size)*q_lora_rank*qlen
+ num_heads * qlen * nope_size * kv_lora_rank
+ num_heads * (kvlen+qlen) * kv_lora_rank * qlen
+ num_heads * rope_size * qlen * (qlen+kvlen)
+ num_heads * kv_lora_rank * (qlen + kvlen) * qlen
+ num_heads * nope_size * kv_lora_rank * qlen
+ hidden_size * num_heads* nope_size * qlen
) * test_iter / (total_time * 1e12)
bandwidth = (
bytes_per_elem
* (
q_lora_rank * hidden_size
+ (kv_lora_rank + rope_size) * hidden_size
+ (nope_size + rope_size) * q_lora_rank * num_heads
+ (nope_size + nope_size) * kv_lora_rank * num_heads
+ hidden_size * nope_size * num_heads
+ hidden_size * qlen
)
* test_iter
/ (total_time * 1e9)
)
flops = (
2
* (
q_lora_rank * hidden_size * qlen
+ kv_lora_rank * hidden_size * qlen
+ num_heads * (nope_size + rope_size) * q_lora_rank * qlen
+ num_heads * qlen * nope_size * kv_lora_rank
+ num_heads * (kvlen + qlen) * kv_lora_rank * qlen
+ num_heads * rope_size * qlen * (qlen + kvlen)
+ num_heads * kv_lora_rank * (qlen + kvlen) * qlen
+ num_heads * nope_size * kv_lora_rank * qlen
+ hidden_size * num_heads * nope_size * qlen
)
* test_iter
/ (total_time * 1e12)
)
print('Quant mode:', quant_mode)
print('Time(s):', total_time)
print('Iteration:', test_iter)
print('Time(us) per iteration:', time_per_iter_us)
print('Bandwidth:', bandwidth, 'GB/s')
print('TFLOPS:', flops)
print('')
print("Quant mode:", quant_mode)
print("Time(s):", total_time)
print("Iteration:", test_iter)
print("Time(us) per iteration:", time_per_iter_us)
print("Bandwidth:", bandwidth, "GB/s")
print("TFLOPS:", flops)
print("")
# Collect test results
result = {
@@ -301,7 +323,7 @@ def bench_mla(quant_mode: str):
"flops_TFLOPS": flops,
"timestamp": time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
"test_parameters": {
"qlen": qlen,
"qlen": qlen,
"kvlen": kvlen,
"page_table": page_table,
"page_size": page_size,
@@ -312,21 +334,16 @@ def bench_mla(quant_mode: str):
"q_lora_rank": q_lora_rank,
"nope_size": nope_size,
"rope_size": rope_size,
"layer_num": layer_num,
"rope_theta": rope_theta,
"max_qlen": max_qlen,
"max_kvlen": max_kvlen,
"max_position_embeddings": max_position_embeddings,
"rope_scaling": rope_scaling,
"warm_up_iter": warm_up_iter,
"test_iter": test_iter,
"CPUInfer_parameter": CPUINFER_PARAM
}
"CPUInfer_parameter": CPUINFER_PARAM,
},
}
# Append git and system info
result.update(get_git_commit())
@@ -334,6 +351,6 @@ def bench_mla(quant_mode: str):
# Record the results to the JSON file
print(result)
record_results(result)
bench_mla("fp16")
bench_mla("fp16")

View File

@@ -1,18 +1,19 @@
#!/usr/bin/env python
# coding=utf-8
'''
Description :
"""
Description :
Author : chenht2022
Date : 2024-07-16 10:43:18
Version : 1.0.0
LastEditors : chenht2022
LastEditTime : 2024-08-06 10:36:04
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
'''
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
"""
import os, sys
import time
sys.path.append(os.path.dirname(__file__) + '/../build')
import kt_kernel_ext
sys.path.append(os.path.dirname(__file__) + "/../build")
from kt_kernel import kt_kernel_ext
import torch
hidden_size = 5120
@@ -25,94 +26,108 @@ CPUInfer = kt_kernel_ext.CPUInfer(64)
warm_up_iter = 1000
test_iter = 10000
def bench_mlp(quant_mode: str):
with torch.inference_mode(mode=True):
hidden_type = 30  # ggml_type::GGML_TYPE_BF16
if quant_mode == "fp32":
gate_type = 0  # ggml_type::GGML_TYPE_F32
up_type = 0  # ggml_type::GGML_TYPE_F32
down_type = 0  # ggml_type::GGML_TYPE_F32
bytes_per_elem = 4.000000
elif quant_mode == "fp16":
gate_type = 1  # ggml_type::GGML_TYPE_F16
up_type = 1  # ggml_type::GGML_TYPE_F16
down_type = 1  # ggml_type::GGML_TYPE_F16
bytes_per_elem = 2.000000
elif quant_mode == "bf16":
gate_type = 30  # ggml_type::GGML_TYPE_BF16
up_type = 30  # ggml_type::GGML_TYPE_BF16
down_type = 30  # ggml_type::GGML_TYPE_BF16
bytes_per_elem = 2.000000
elif quant_mode == "q8_0":
gate_type = 8  # ggml_type::GGML_TYPE_Q8_0
up_type = 8  # ggml_type::GGML_TYPE_Q8_0
down_type = 8  # ggml_type::GGML_TYPE_Q8_0
bytes_per_elem = 1.062500
elif quant_mode == "q6_k":
gate_type = 14  # ggml_type::GGML_TYPE_Q6_K
up_type = 14  # ggml_type::GGML_TYPE_Q6_K
down_type = 14  # ggml_type::GGML_TYPE_Q6_K
bytes_per_elem = 0.820312
elif quant_mode == "q5_k_m":
gate_type = 13  # ggml_type::GGML_TYPE_Q5_K
up_type = 13  # ggml_type::GGML_TYPE_Q5_K
down_type = 14  # ggml_type::GGML_TYPE_Q6_K
bytes_per_elem = 0.731771
elif quant_mode == "q4_k_m":
gate_type = 12  # ggml_type::GGML_TYPE_Q4_K
up_type = 12  # ggml_type::GGML_TYPE_Q4_K
down_type = 14  # ggml_type::GGML_TYPE_Q6_K
bytes_per_elem = 0.648437
elif quant_mode == "q3_k_m":
gate_type = 11  # ggml_type::GGML_TYPE_Q3_K
up_type = 11  # ggml_type::GGML_TYPE_Q3_K
down_type = 13  # ggml_type::GGML_TYPE_Q5_K
bytes_per_elem = 0.515625
elif quant_mode == "q2_k":
gate_type = 10  # ggml_type::GGML_TYPE_Q2_K
up_type = 10  # ggml_type::GGML_TYPE_Q2_K
down_type = 11  # ggml_type::GGML_TYPE_Q3_K
bytes_per_elem = 0.328125
elif quant_mode == "iq3_xs":
gate_type = 21  # ggml_type::GGML_TYPE_IQ3_S
up_type = 21  # ggml_type::GGML_TYPE_IQ3_S
down_type = 21  # ggml_type::GGML_TYPE_IQ3_S
bytes_per_elem = 0.429688
elif quant_mode == "iq2_xxs":
gate_type = 16  # ggml_type::GGML_TYPE_IQ2_XXS
up_type = 16  # ggml_type::GGML_TYPE_IQ2_XXS
down_type = 16  # ggml_type::GGML_TYPE_IQ2_XXS
bytes_per_elem = 0.257812
else:
assert(False)
assert False
mlps = []
gate_projs = []
up_projs = []
down_projs = []
for _ in range(layer_num):
gate_proj = torch.randn((intermediate_size, hidden_size), dtype=torch.float32, device = "cuda").to("cpu").contiguous()
up_proj = torch.randn((intermediate_size, hidden_size), dtype=torch.float32, device = "cuda").to("cpu").contiguous()
down_proj = torch.randn((hidden_size, intermediate_size), dtype=torch.float32, device = "cuda").to("cpu").contiguous()
config = kt_kernel_ext.mlp.MLPConfig(hidden_size, intermediate_size, stride, group_max_len, gate_proj.data_ptr(), up_proj.data_ptr(), down_proj.data_ptr(), gate_type, up_type, down_type, hidden_type)
gate_proj = (
torch.randn((intermediate_size, hidden_size), dtype=torch.float32, device="cuda").to("cpu").contiguous()
)
up_proj = (
torch.randn((intermediate_size, hidden_size), dtype=torch.float32, device="cuda").to("cpu").contiguous()
)
down_proj = (
torch.randn((hidden_size, intermediate_size), dtype=torch.float32, device="cuda").to("cpu").contiguous()
)
config = kt_kernel_ext.mlp.MLPConfig(
hidden_size,
intermediate_size,
stride,
group_max_len,
gate_proj.data_ptr(),
up_proj.data_ptr(),
down_proj.data_ptr(),
gate_type,
up_type,
down_type,
hidden_type,
)
mlp = kt_kernel_ext.mlp.MLP(config)
gate_projs.append(gate_proj)
up_projs.append(up_proj)
down_projs.append(down_proj)
mlps.append(mlp)
input = torch.randn((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device = "cuda").to("cpu").contiguous()
output = torch.empty((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device = "cuda").to("cpu").contiguous()
input = torch.randn((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device="cuda").to("cpu").contiguous()
output = torch.empty((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device="cuda").to("cpu").contiguous()
# warm up
for i in range(warm_up_iter):
CPUInfer.submit(
mlps[i % layer_num].forward(
qlen,
input[i % layer_num].data_ptr(),
output[i % layer_num].data_ptr()
)
mlps[i % layer_num].forward(qlen, input[i % layer_num].data_ptr(), output[i % layer_num].data_ptr())
)
CPUInfer.sync()
@@ -120,21 +135,22 @@ def bench_mlp(quant_mode: str):
start = time.perf_counter()
for i in range(test_iter):
CPUInfer.submit(
mlps[i % layer_num].forward(
qlen,
input[i % layer_num].data_ptr(),
output[i % layer_num].data_ptr()
)
mlps[i % layer_num].forward(qlen, input[i % layer_num].data_ptr(), output[i % layer_num].data_ptr())
)
CPUInfer.sync()
end = time.perf_counter()
total_time = end - start
print('Quant mode: ', quant_mode)
print('Time(s): ', total_time)
print('Iteration: ', test_iter)
print('Time(us) per iteration: ', total_time / test_iter * 1000000)
print('Bandwidth: ', hidden_size * intermediate_size * 3 * bytes_per_elem * test_iter / total_time / 1000 / 1000 / 1000, 'GB/s')
print('')
print("Quant mode: ", quant_mode)
print("Time(s): ", total_time)
print("Iteration: ", test_iter)
print("Time(us) per iteration: ", total_time / test_iter * 1000000)
print(
"Bandwidth: ",
hidden_size * intermediate_size * 3 * bytes_per_elem * test_iter / total_time / 1000 / 1000 / 1000,
"GB/s",
)
print("")
bench_mlp("fp32")
bench_mlp("fp16")

View File

@@ -5,8 +5,8 @@ import json
import subprocess
import platform
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'build'))
import kt_kernel_ext
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "build"))
from kt_kernel import kt_kernel_ext
import torch
from tqdm import tqdm
@@ -35,7 +35,7 @@ CPUInfer = kt_kernel_ext.CPUInfer(CPUINFER_PARAM)
script_path = os.path.abspath(__file__)
script_dir = os.path.dirname(script_path)
script_name = os.path.splitext(os.path.basename(script_path))[0]
json_path = os.path.join(script_dir, "bench_results "+ ".jsonl")
json_path = os.path.join(script_dir, "bench_results " + ".jsonl")
def get_git_commit():
@@ -76,9 +76,9 @@ def get_system_info():
# Get the CPU model (Linux only)
cpu_model = None
if os.path.exists('/proc/cpuinfo'):
if os.path.exists("/proc/cpuinfo"):
try:
with open('/proc/cpuinfo', 'r') as f:
with open("/proc/cpuinfo", "r") as f:
for line in f:
if "model name" in line:
cpu_model = line.split(":", 1)[1].strip()
@@ -89,9 +89,9 @@ def get_system_info():
# Get total memory in GB (Linux only)
mem_total_gb = None
if os.path.exists('/proc/meminfo'):
if os.path.exists("/proc/meminfo"):
try:
with open('/proc/meminfo', 'r') as f:
with open("/proc/meminfo", "r") as f:
for line in f:
if "MemTotal" in line:
mem_kb = float(line.split(":", 1)[1].split()[0])
@@ -134,57 +134,57 @@ def bench_moe(quant_mode: str):
# Set dtypes and bytes_per_elem according to the quantization mode
hidden_type = 30  # ggml_type::GGML_TYPE_BF16 (fixed)
if quant_mode == "fp32":
gate_type = 0  # ggml_type::GGML_TYPE_F32
up_type = 0
down_type = 0
bytes_per_elem = 4.0
elif quant_mode == "fp16":
gate_type = 1  # ggml_type::GGML_TYPE_F16
up_type = 1
down_type = 1
bytes_per_elem = 2.0
elif quant_mode == "bf16":
gate_type = 30  # ggml_type::GGML_TYPE_BF16
up_type = 30
down_type = 30
bytes_per_elem = 2.0
elif quant_mode == "q8_0":
gate_type = 8  # ggml_type::GGML_TYPE_Q8_0
up_type = 8
down_type = 8
bytes_per_elem = 1.062500
elif quant_mode == "q6_k":
gate_type = 14  # ggml_type::GGML_TYPE_Q6_K
up_type = 14
down_type = 14
bytes_per_elem = 0.820312
elif quant_mode == "q5_k_m":
gate_type = 13  # ggml_type::GGML_TYPE_Q5_K
up_type = 13
down_type = 14  # ggml_type::GGML_TYPE_Q6_K
bytes_per_elem = 0.731771
elif quant_mode == "q4_k_m":
gate_type = 12  # ggml_type::GGML_TYPE_Q4_K
up_type = 12
down_type = 14  # ggml_type::GGML_TYPE_Q6_K
bytes_per_elem = 0.648437
elif quant_mode == "q3_k_m":
gate_type = 11  # ggml_type::GGML_TYPE_Q3_K
up_type = 11
down_type = 13  # ggml_type::GGML_TYPE_Q5_K
bytes_per_elem = 0.515625
elif quant_mode == "q2_k":
gate_type = 10  # ggml_type::GGML_TYPE_Q2_K
up_type = 10
down_type = 11  # ggml_type::GGML_TYPE_Q3_K
bytes_per_elem = 0.328125
elif quant_mode == "iq3_xs":
gate_type = 21  # ggml_type::GGML_TYPE_IQ3_S
up_type = 21
down_type = 21
bytes_per_elem = 0.429688
elif quant_mode == "iq2_xxs":
gate_type = 16  # ggml_type::GGML_TYPE_IQ2_XXS
up_type = 16
down_type = 16
bytes_per_elem = 0.257812
@@ -194,13 +194,25 @@ def bench_moe(quant_mode: str):
# Build per-layer MoE models
moes = []
for _ in tqdm(range(layer_num), desc="Initializing MOEs"):
gate_proj = torch.randn((expert_num, intermediate_size, hidden_size), dtype=torch.float16, device="cpu").to("cpu").contiguous()
up_proj = torch.randn((expert_num, intermediate_size, hidden_size), dtype=torch.float16, device="cpu").to("cpu").contiguous()
down_proj = torch.randn((expert_num, hidden_size, intermediate_size), dtype=torch.float16, device="cpu").to("cpu").contiguous()
gate_proj = (
torch.randn((expert_num, intermediate_size, hidden_size), dtype=torch.float16, device="cpu")
.to("cpu")
.contiguous()
)
up_proj = (
torch.randn((expert_num, intermediate_size, hidden_size), dtype=torch.float16, device="cpu")
.to("cpu")
.contiguous()
)
down_proj = (
torch.randn((expert_num, hidden_size, intermediate_size), dtype=torch.float16, device="cpu")
.to("cpu")
.contiguous()
)
config = kt_kernel_ext.moe.MOEConfig(expert_num, num_experts_per_tok, hidden_size, intermediate_size)
config.pool = CPUInfer.backend_
config.m_block = m_block
config.group_min_len = group_min_len
config.group_max_len = group_max_len
config.gate_proj = gate_proj.data_ptr()
@@ -215,47 +227,52 @@ def bench_moe(quant_mode: str):
CPUInfer.submit(moe.load_weights_task())
CPUInfer.sync()
moes.append(moe)
# Generate input data
print('Generating data...')
print("Generating data...")
# Expert routing indices and weights, one set per layer
gen_iter = 1000
expert_ids = torch.rand(gen_iter * qlen , expert_num, device="cpu").argsort(dim=-1)[:, :num_experts_per_tok].reshape(gen_iter, qlen * num_experts_per_tok).contiguous()
weights = torch.rand((gen_iter, qlen, num_experts_per_tok), dtype=torch.float32, device="cpu").contiguous()
expert_ids = (
torch.rand(gen_iter * qlen, expert_num, device="cpu")
.argsort(dim=-1)[:, :num_experts_per_tok]
.reshape(gen_iter, qlen * num_experts_per_tok)
.contiguous()
)
weights = torch.rand((gen_iter, qlen, num_experts_per_tok), dtype=torch.float32, device="cpu").contiguous()
input_tensor = torch.randn((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device="cpu").contiguous()
output_tensor = torch.empty((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device="cpu").contiguous()
# Wrap qlen in a tensor for the forward call
qlen_tensor = torch.tensor([qlen], dtype=torch.int32)
# Warm-up phase
print('Warming up...')
print("Warming up...")
for i in tqdm(range(warm_up_iter), desc="Warm-up"):
CPUInfer.submit(
moes[i % layer_num].forward_task(
qlen_tensor.data_ptr(),
num_experts_per_tok,
expert_ids[i%gen_iter].data_ptr(),
weights[i%gen_iter].data_ptr(),
expert_ids[i % gen_iter].data_ptr(),
weights[i % gen_iter].data_ptr(),
input_tensor[i % layer_num].data_ptr(),
output_tensor[i % layer_num].data_ptr(),
False
False,
)
)
CPUInfer.sync()
# Test phase
print('Start testing...')
print("Start testing...")
start = time.perf_counter()
for i in tqdm(range(test_iter), desc="Testing"):
CPUInfer.submit(
moes[i % layer_num].forward_task(
qlen_tensor.data_ptr(),
num_experts_per_tok,
expert_ids[i%gen_iter].data_ptr(),
weights[i%gen_iter].data_ptr(),
expert_ids[i % gen_iter].data_ptr(),
weights[i % gen_iter].data_ptr(),
input_tensor[i % layer_num].data_ptr(),
output_tensor[i % layer_num].data_ptr(),
False
False,
)
)
CPUInfer.sync()
@@ -264,17 +281,29 @@ def bench_moe(quant_mode: str):
# Compute performance metrics
time_per_iter_us = total_time / test_iter * 1e6
bandwidth = hidden_size * intermediate_size * 3 * num_experts_per_tok * (1/8 * 256 * (1-(31/32)**qlen)) * bytes_per_elem * test_iter / total_time / 1e9 # unit: GB/s
flops = hidden_size * intermediate_size * qlen * 3 * num_experts_per_tok * 2 * test_iter / total_time / 1e12 # unit: TFLOPS
bandwidth = (
hidden_size
* intermediate_size
* 3
* num_experts_per_tok
* (1 / 8 * 256 * (1 - (31 / 32) ** qlen))
* bytes_per_elem
* test_iter
/ total_time
/ 1e9
) # unit: GB/s
flops = (
hidden_size * intermediate_size * qlen * 3 * num_experts_per_tok * 2 * test_iter / total_time / 1e12
) # unit: TFLOPS
# Print results
print('Quant mode:', quant_mode)
print('Time(s):', total_time)
print('Iteration:', test_iter)
print('Time(us) per iteration:', time_per_iter_us)
print('Bandwidth:', bandwidth, 'GB/s')
print('TFLOPS:', flops)
print('')
print("Quant mode:", quant_mode)
print("Time(s):", total_time)
print("Iteration:", test_iter)
print("Time(us) per iteration:", time_per_iter_us)
print("Bandwidth:", bandwidth, "GB/s")
print("TFLOPS:", flops)
print("")
# Collect test results
result = {
@@ -298,8 +327,8 @@ def bench_moe(quant_mode: str):
"qlen": qlen,
"warm_up_iter": warm_up_iter,
"test_iter": test_iter,
"CPUInfer_parameter": CPUINFER_PARAM
}
"CPUInfer_parameter": CPUINFER_PARAM,
},
}
# Append git and system info
result.update(get_git_commit())
@@ -321,4 +350,4 @@ if __name__ == "__main__":
# bench_moe("q3_k_m", layer_num)
# bench_moe("q2_k", layer_num)
# bench_moe("iq3_xs", layer_num)
# bench_moe("iq2_xxs", layer_num)
# bench_moe("iq2_xxs", layer_num)

View File

@@ -15,7 +15,7 @@ from tqdm import tqdm
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "build"))
import torch
import kt_kernel_ext
from kt_kernel import kt_kernel_ext
import numpy as np
# Benchmark parameter settings

View File

@@ -1,19 +1,20 @@
#!/usr/bin/env python
# coding=utf-8
'''
Description :
"""
Description :
Author : chenht2022
Date : 2024-07-25 10:32:05
Version : 1.0.0
LastEditors : chenht2022
LastEditTime : 2024-08-06 10:41:28
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
'''
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
"""
import os, sys, time, json, subprocess, platform
from tqdm import tqdm
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'build'))
import kt_kernel_ext
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "build"))
from kt_kernel import kt_kernel_ext
import torch
import numpy as np
@@ -21,33 +22,28 @@ import numpy as np
expert_num = 256
hidden_size = 7168
intermediate_size = 2048
max_len = 25600
num_experts_per_tok = 8
layer_num = 4
qlen = 1024
# qlen = 1
warm_up_iter = 1000
test_iter = 5000
k_group_size = 128
physical_to_logical_map = torch.tensor(
data=range(expert_num),
device="cpu",
dtype=torch.int64
).contiguous()
physical_to_logical_map = torch.tensor(data=range(expert_num), device="cpu", dtype=torch.int64).contiguous()
# Make the CPUInfer parameter a variable
# CPUINFER_PARAM = 257
# CPUInfer = kt_kernel_ext.CPUInfer(CPUINFER_PARAM)
worker_config = kt_kernel_ext.WorkerPoolConfig()
worker_config.subpool_count = 2
worker_config.subpool_numa_map= [0,1]
worker_config.subpool_thread_count = [40,40]
worker_config.subpool_numa_map = [0, 1]
worker_config.subpool_thread_count = [40, 40]
CPUINFER_PARAM = 80
CPUInfer = kt_kernel_ext.CPUInfer(worker_config)
def get_git_commit():
"""
Get the current git commit (hash and message),
@@ -82,14 +78,14 @@ def get_system_info():
info = {}
# System name and host name
uname = platform.uname()
info["system_name"] = uname.system # 如 Linux, Windows 等
info["node_name"] = uname.node # 主机名称
info["system_name"] = uname.system # 如 Linux, Windows 等
info["node_name"] = uname.node # 主机名称
# Get the CPU model (Linux only)
cpu_model = None
if os.path.exists('/proc/cpuinfo'):
if os.path.exists("/proc/cpuinfo"):
try:
with open('/proc/cpuinfo', 'r') as f:
with open("/proc/cpuinfo", "r") as f:
for line in f:
if "model name" in line:
cpu_model = line.split(":", 1)[1].strip()
@@ -100,9 +96,9 @@ def get_system_info():
# Get total memory in GB (Linux only)
mem_total_gb = None
if os.path.exists('/proc/meminfo'):
if os.path.exists("/proc/meminfo"):
try:
with open('/proc/meminfo', 'r') as f:
with open("/proc/meminfo", "r") as f:
for line in f:
if "MemTotal" in line:
mem_kb = float(line.split(":", 1)[1].split()[0])
@@ -130,11 +126,13 @@ def get_system_info():
return info
script_path = os.path.abspath(__file__)
script_dir = os.path.dirname(script_path)
script_name = os.path.splitext(os.path.basename(script_path))[0]
json_path = os.path.join(script_dir, script_name + ".jsonl")
def record_results(result, filename=json_path):
"""
Append the result to the file in JSON format
@@ -142,6 +140,7 @@ def record_results(result, filename=json_path):
with open(filename, "a") as f:
f.write(json.dumps(result) + "\n")
def bench_moe(quant_mode: str):
with torch.inference_mode():
if quant_mode == "bf16":
@@ -160,11 +159,22 @@ def bench_moe(quant_mode: str):
up_projs = []
down_projs = []
for layer_index in range(layer_num):
gate_proj = torch.randn((expert_num, intermediate_size, hidden_size), dtype=torch.float32, device="cuda").to("cpu").contiguous()
up_proj = torch.randn((expert_num, intermediate_size, hidden_size), dtype=torch.float32, device="cuda").to("cpu").contiguous()
down_proj = torch.randn((expert_num, hidden_size, intermediate_size), dtype=torch.float32, device="cuda").to("cpu").contiguous()
config = kt_kernel_ext.moe.MOEConfig(
expert_num, num_experts_per_tok, hidden_size, intermediate_size,0)
gate_proj = (
torch.randn((expert_num, intermediate_size, hidden_size), dtype=torch.float32, device="cuda")
.to("cpu")
.contiguous()
)
up_proj = (
torch.randn((expert_num, intermediate_size, hidden_size), dtype=torch.float32, device="cuda")
.to("cpu")
.contiguous()
)
down_proj = (
torch.randn((expert_num, hidden_size, intermediate_size), dtype=torch.float32, device="cuda")
.to("cpu")
.contiguous()
)
config = kt_kernel_ext.moe.MOEConfig(expert_num, num_experts_per_tok, hidden_size, intermediate_size, 0)
config.max_len = max_len
config.gate_proj = gate_proj.data_ptr()
config.up_proj = up_proj.data_ptr()
@@ -189,10 +199,22 @@ def bench_moe(quant_mode: str):
down_projs.append(down_proj)
moes.append(moe)
gen_iter = 3000
expert_ids = torch.rand(gen_iter * qlen , expert_num, device="cpu").argsort(dim=-1)[:, :num_experts_per_tok].reshape(gen_iter, qlen * num_experts_per_tok).to("cpu").contiguous()
weights = torch.rand((gen_iter, qlen, num_experts_per_tok), dtype=torch.float32, device="cpu").to("cpu").contiguous()
input_tensor = torch.randn((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device="cuda").to("cpu").contiguous()
output_tensor = torch.empty((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device="cuda").to("cpu").contiguous()
expert_ids = (
torch.rand(gen_iter * qlen, expert_num, device="cpu")
.argsort(dim=-1)[:, :num_experts_per_tok]
.reshape(gen_iter, qlen * num_experts_per_tok)
.to("cpu")
.contiguous()
)
weights = (
torch.rand((gen_iter, qlen, num_experts_per_tok), dtype=torch.float32, device="cpu").to("cpu").contiguous()
)
input_tensor = (
torch.randn((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device="cuda").to("cpu").contiguous()
)
output_tensor = (
torch.empty((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device="cuda").to("cpu").contiguous()
)
bsz_tensor = torch.tensor([qlen], device="cpu")
# Warm-up iterations
@@ -203,8 +225,8 @@ def bench_moe(quant_mode: str):
moes[i % layer_num].forward_task(
bsz_tensor.data_ptr(),
num_experts_per_tok,
expert_ids[i%gen_iter].data_ptr(),
weights[i%gen_iter].data_ptr(),
expert_ids[i % gen_iter].data_ptr(),
weights[i % gen_iter].data_ptr(),
input_tensor[i % layer_num].data_ptr(),
output_tensor[i % layer_num].data_ptr(),
False,
@@ -224,8 +246,8 @@ def bench_moe(quant_mode: str):
moes[i % layer_num].forward_task(
bsz_tensor.data_ptr(),
num_experts_per_tok,
expert_ids[i%gen_iter].data_ptr(),
weights[i%gen_iter].data_ptr(),
expert_ids[i % gen_iter].data_ptr(),
weights[i % gen_iter].data_ptr(),
input_tensor[i % layer_num].data_ptr(),
output_tensor[i % layer_num].data_ptr(),
False,
@@ -239,16 +261,28 @@ def bench_moe(quant_mode: str):
# Compute performance metrics
time_per_iter_us = total_time / test_iter * 1e6
bandwidth = hidden_size * intermediate_size * 3 * num_experts_per_tok * (1/8 * 256 * (1-(31/32)**qlen)) * bytes_per_elem * test_iter / total_time / 1e9 # unit: GB/s
flops = hidden_size * intermediate_size * qlen * 3 * num_experts_per_tok * 2 * test_iter / total_time / 1e12 # unit: TFLOPS
bandwidth = (
hidden_size
* intermediate_size
* 3
* num_experts_per_tok
* (1 / 8 * 256 * (1 - (31 / 32) ** qlen))
* bytes_per_elem
* test_iter
/ total_time
/ 1e9
) # unit: GB/s
flops = (
hidden_size * intermediate_size * qlen * 3 * num_experts_per_tok * 2 * test_iter / total_time / 1e12
) # unit: TFLOPS
print('Quant mode: ', quant_mode)
print('Time(s): ', total_time)
print('Iteration: ', test_iter)
print('Time(us) per iteration: ', time_per_iter_us)
print('Bandwidth: ', bandwidth, 'GB/s')
print('Flops: ', flops, 'TFLOPS')
print('')
print("Quant mode: ", quant_mode)
print("Time(s): ", total_time)
print("Iteration: ", test_iter)
print("Time(us) per iteration: ", time_per_iter_us)
print("Bandwidth: ", bandwidth, "GB/s")
print("Flops: ", flops, "TFLOPS")
print("")
# Collect the result record, including test parameters
result = {
@@ -270,8 +304,8 @@ def bench_moe(quant_mode: str):
"warm_up_iter": warm_up_iter,
"test_iter": test_iter,
"CPUInfer_parameter": CPUINFER_PARAM,
"k_group_size": k_group_size
}
"k_group_size": k_group_size,
},
}
# Append git commit info
result.update(get_git_commit())
@@ -280,9 +314,10 @@ def bench_moe(quant_mode: str):
# Append the result to the file as JSON
record_results(result)
if __name__ == "__main__":
# Select the quantization modes to test
# bench_moe("bf16")
# bench_moe("int8")
# bench_moe("int4")
bench_moe("int4_1k")
bench_moe("int4_1k")

View File

@@ -14,7 +14,7 @@ import os, sys, time, json, subprocess, platform
os.environ["BLAS_NUM_THREADS"] = "1"
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "build"))
import torch
import kt_kernel_ext
from kt_kernel import kt_kernel_ext
import numpy as np
from tqdm import tqdm

View File

@@ -26,7 +26,7 @@ os.environ.setdefault("BLAS_NUM_THREADS", "1")
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "build"))
import torch # noqa: E402
import kt_kernel_ext as ce # noqa: E402
from kt_kernel import kt_kernel_ext as ce # noqa: E402
from tqdm import tqdm # noqa: E402

View File

@@ -13,7 +13,7 @@ import os, sys, time, json, subprocess, platform
os.environ["BLAS_NUM_THREADS"] = "1"
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "build"))
import kt_kernel_ext
from kt_kernel import kt_kernel_ext
import torch
import numpy as np
from tqdm import tqdm

View File

@@ -1,9 +1,10 @@
import os
import sys
sys.path.insert(0, os.path.dirname(__file__) + '/../build')
sys.path.insert(0, os.path.dirname(__file__) + "/../build")
import torch
import ctypes
import kt_kernel_ext
from kt_kernel import kt_kernel_ext
from kt_kernel_ext.moe import MOEConfig, MOE, AMXBF16_MOE, AMXInt8_MOE, AMXInt4_MOE, AMXInt4_1_MOE
intermediate_size_full = 2048
@@ -14,20 +15,14 @@ num_experts_per_tok = 8
cpu_infer = kt_kernel_ext.CPUInfer(97)
up = torch.empty(experts_num, intermediate_size_full, hidden_size, dtype=torch.bfloat16, device="cpu")
gate = torch.empty(experts_num, intermediate_size_full, hidden_size, dtype=torch.bfloat16, device="cpu")
down = torch.empty(experts_num, hidden_size, intermediate_size_full, dtype=torch.bfloat16, device="cpu")
gate_ptr = ctypes.addressof(
ctypes.cast(gate.data_ptr(), ctypes.POINTER(ctypes.c_uint64)).contents
)
up_ptr = ctypes.addressof(
ctypes.cast(up.data_ptr(), ctypes.POINTER(ctypes.c_uint64)).contents
)
down_ptr = ctypes.addressof(
ctypes.cast(down.data_ptr(), ctypes.POINTER(ctypes.c_uint64)).contents
)
gate_ptr = ctypes.addressof(ctypes.cast(gate.data_ptr(), ctypes.POINTER(ctypes.c_uint64)).contents)
up_ptr = ctypes.addressof(ctypes.cast(up.data_ptr(), ctypes.POINTER(ctypes.c_uint64)).contents)
down_ptr = ctypes.addressof(ctypes.cast(down.data_ptr(), ctypes.POINTER(ctypes.c_uint64)).contents)
moe_config = MOEConfig(
experts_num,
num_experts_per_tok,
@@ -36,9 +31,9 @@ moe_config = MOEConfig(
)
moe_config.layer_idx = 45
moe_config.pool = cpu_infer.backend_
moe_config.max_len = 1024 #TODO(zbx): multi cuda graph
moe_config.max_len = 1024 # TODO(zbx): multi cuda graph
moe_config.gate_proj = gate_ptr
moe_config.up_proj = up_ptr
moe_config.down_proj = down_ptr
moe_config.path = ""
moe = AMXInt4_MOE(moe_config)
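The gate_ptr/up_ptr/down_ptr lines above are a ctypes round-trip that simply recovers each tensor's own data address: casting the integer from data_ptr() to POINTER(c_uint64) and taking addressof of its contents returns the same integer. A minimal check of that equivalence:

import ctypes
import torch

t = torch.empty(4, dtype=torch.bfloat16)
p = ctypes.addressof(ctypes.cast(t.data_ptr(), ctypes.POINTER(ctypes.c_uint64)).contents)
assert p == t.data_ptr()  # the cast/addressof round-trip is an identity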

View File

@@ -1,19 +1,19 @@
#!/usr/bin/env python
# coding=utf-8
"""
Description :
Description :
Author : Jianwei Dong
Date : 2024-08-28 10:32:05
Version : 1.0.0
LastEditors : chenht2022
LastEditors : chenht2022
LastEditTime : 2024-08-28 10:32:05
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
"""
import os, sys
import time
sys.path.append(os.path.dirname(__file__) + "/../build")
import kt_kernel_ext
from kt_kernel import kt_kernel_ext
from flash_attn import flash_attn_with_kvcache
import torch
@@ -59,19 +59,11 @@ with torch.inference_mode(mode=True):
local_kvcache = kt_kernel_ext.kvcache.KVCache(config)
kvcaches = []
block_table = (
torch.arange(max_block_num, dtype=torch.int32, device="cpu")
.contiguous()
.view(1, -1)
)
block_table = torch.arange(max_block_num, dtype=torch.int32, device="cpu").contiguous().view(1, -1)
for layer_idx in range(layer_num):
k_cache = torch.randn(
(1, cache_seqlen, kv_head_num, head_dim), dtype=torch.float16, device="cpu"
).contiguous()
v_cache = torch.randn(
(1, cache_seqlen, kv_head_num, head_dim), dtype=torch.float16, device="cpu"
).contiguous()
k_cache = torch.randn((1, cache_seqlen, kv_head_num, head_dim), dtype=torch.float16, device="cpu").contiguous()
v_cache = torch.randn((1, cache_seqlen, kv_head_num, head_dim), dtype=torch.float16, device="cpu").contiguous()
CPUInfer.submit(
local_kvcache.update_kvcache_fp16(
@@ -94,17 +86,11 @@ with torch.inference_mode(mode=True):
k_cache = kvcaches[i % layer_num][0]
v_cache = kvcaches[i % layer_num][1]
input = torch.randn(
(1, 1, q_head_num, head_dim), dtype=torch.float16, device="cpu"
).contiguous()
output = torch.empty(
(1, 1, q_head_num, head_dim), dtype=torch.float16, device="cpu"
).contiguous()
input = torch.randn((1, 1, q_head_num, head_dim), dtype=torch.float16, device="cpu").contiguous()
output = torch.empty((1, 1, q_head_num, head_dim), dtype=torch.float16, device="cpu").contiguous()
# attn_lse: (bsz, q_len, q_head_num)
attn_lse = torch.empty(
(1, 1, q_head_num), dtype=torch.float32, device="cpu"
).contiguous()
attn_lse = torch.empty((1, 1, q_head_num), dtype=torch.float32, device="cpu").contiguous()
input = input / 100
CPUInfer.submit(
@@ -135,8 +121,6 @@ with torch.inference_mode(mode=True):
)
# print("torch output", t_output)
diff = torch.mean(torch.abs(output.to("cuda") - t_output)) / torch.mean(
torch.abs(t_output)
)
diff = torch.mean(torch.abs(output.to("cuda") - t_output)) / torch.mean(torch.abs(t_output))
print("diff = ", diff)
assert diff < 0.001
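The diff value asserted above is a mean relative error: the mean absolute elementwise difference between the CPU kernel's output and flash_attn's reference, normalized by the mean absolute reference magnitude. A standalone restatement of the metric:

import torch

def mean_relative_error(out: torch.Tensor, ref: torch.Tensor) -> float:
    # matches the test above: mean(|out - ref|) / mean(|ref|)
    return (torch.mean(torch.abs(out - ref)) / torch.mean(torch.abs(ref))).item()

# the test requires mean_relative_error(output.to("cuda"), t_output) < 0.001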

View File

@@ -2,7 +2,7 @@ import os, sys
sys.path.insert(0, os.path.dirname(__file__) + "/../build")
import kt_kernel_ext
from kt_kernel import kt_kernel_ext
import torch
# Set fixed seed for reproducible results

View File

@@ -1,8 +1,9 @@
import os, sys
import time
os.environ["BLAS_NUM_THREADS"] = "1"
sys.path.insert(0, os.path.dirname(__file__) + "/../build")
import kt_kernel_ext
from kt_kernel import kt_kernel_ext
from kt_kernel_ext.kvcache import ggml_type
import torch
import logging
@@ -20,6 +21,7 @@ from transformers import (
logger = logging.getLogger("reader")
from gguf.gguf_reader import GGUFReader
# load_layers = 6
load_layers = None
CPUInfer = kt_kernel_ext.CPUInfer(304)
@@ -284,22 +286,21 @@ def build_moegate(layer_idx, json_config, gguf_weights):
json_config["topk_group"],
)
config.routed_scaling_factor = json_config['routed_scaling_factor']
config.routed_scaling_factor = json_config["routed_scaling_factor"]
config.pool = CPUInfer.backend_
weight,weight_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.ffn_gate_inp.weight")
weight, weight_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.ffn_gate_inp.weight")
config.weight = weight.data_ptr()
config.weight_type = type_to_ggml_type(weight_type)
bias,bias_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.exp_probs_b.bias")
bias, bias_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.exp_probs_b.bias")
config.e_score_correction_bias = bias.data_ptr()
config.e_score_correction_bias_type = type_to_ggml_type(bias_type)
gate = kt_kernel_ext.gate.MoEGate(config)
return gate
def build_llm(json_config, gguf_weights):
@@ -312,15 +313,15 @@ def build_llm(json_config, gguf_weights):
general_config.n_shared_experts = json_config["n_shared_experts"]
general_config.max_qlen = max_qlen
lm_heads,lm_heads_type = get_torch_tensor_and_type_from_gguf(gguf_weights, "output.weight")
lm_heads, lm_heads_type = get_torch_tensor_and_type_from_gguf(gguf_weights, "output.weight")
general_config.lm_heads_ptr = lm_heads.data_ptr()
general_config.lm_heads_type = type_to_ggml_type(lm_heads_type)
output_norm, output_norm_type = get_torch_tensor_and_type_from_gguf(gguf_weights, "output_norm.weight")
general_config.norm_weights_ptr = output_norm.data_ptr()
general_config.norm_weights_type = type_to_ggml_type(output_norm_type)
token_embd,token_embd_type = get_torch_tensor_and_type_from_gguf(weights, "token_embd.weight")
token_embd, token_embd_type = get_torch_tensor_and_type_from_gguf(weights, "token_embd.weight")
general_config.token_embd_ptr = token_embd.data_ptr()
general_config.token_embd_type = type_to_ggml_type(token_embd_type)
@@ -330,12 +331,11 @@ def build_llm(json_config, gguf_weights):
model = kt_kernel_ext.DeepseekV3Model(general_config)
llm.model = model
decoder_layers = []
real_load_layers = json_config["num_hidden_layers"] if load_layers is None else load_layers
for i in range(real_load_layers):
layer = kt_kernel_ext.DeepseekV3DecoderLayer(general_config,i)
layer = kt_kernel_ext.DeepseekV3DecoderLayer(general_config, i)
attn_norm, attn_norm_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{i}.attn_norm.weight")
ffn_norm, ffn_norm_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{i}.ffn_norm.weight")
@@ -351,11 +351,11 @@ def build_llm(json_config, gguf_weights):
layer.ffn = build_ffn(i, json_config, gguf_weights)
decoder_layers.append(layer)
model.layers = decoder_layers
model.layers = decoder_layers
return llm
safetensor_path = '/home/bd/models/DeepSeek-R1'
safetensor_path = "/home/bd/models/DeepSeek-R1"
json_path = os.path.join(safetensor_path, "config.json")
json_config = json.load(open(json_path, "r"))
print(json_config)
@@ -368,11 +368,11 @@ weights = dict(sorted(weights.items()))
for name, t in weights.items():
# if not name.startswith("blk"):
# if name.startswith("blk.10."):
# if "ffn_gate." in name:
# print(f"Found weight: {t.name}, Shape: {t.shape}, Type: {t.tensor_type.name}, Size: {t.n_elements}")
# if "ffn_gate." in name:
# print(f"Found weight: {t.name}, Shape: {t.shape}, Type: {t.tensor_type.name}, Size: {t.n_elements}")
print(f"Found weight: {t.name}, Shape: {t.shape}, Type: {t.tensor_type.name}, Size: {t.n_elements}")
print("Building LLM ...")
print("Building LLM ...")
load_start_time = time.perf_counter()
llm = build_llm(json_config, weights)
load_end_time = time.perf_counter()
@@ -389,22 +389,20 @@ config = AutoConfig.from_pretrained(safetensor_path, trust_remote_code=True)
force_think = False
output_logits = torch.zeros((max_qlen, json_config['vocab_size']), dtype=torch.float32)
output_logits = torch.zeros((max_qlen, json_config["vocab_size"]), dtype=torch.float32)
def start_chat(content=None):
if content is None:
content = input("Chat: ")
messages = [{"role": "user", "content": content}]
input_tensor = tokenizer.apply_chat_template(
messages, add_generation_prompt=True, return_tensors="pt"
)
input_tensor = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt")
if force_think:
token_thinks = torch.tensor([tokenizer.encode("<think>\\n",add_special_tokens=False)],device=input_tensor.device)
input_tensor = torch.cat(
[input_tensor, token_thinks], dim=1
token_thinks = torch.tensor(
[tokenizer.encode("<think>\\n", add_special_tokens=False)], device=input_tensor.device
)
input_tensor = torch.cat([input_tensor, token_thinks], dim=1)
    input_tensor = input_tensor.squeeze(0)  # drop the batch dimension
print(f"Input tensor: {input_tensor}, type {input_tensor.dtype}, shape {input_tensor.shape}")
@@ -415,34 +413,36 @@ def start_chat(content=None):
stream = TextStreamer(tokenizer)
qlen = input_tensor.shape[0]
qlens = [qlen-kvlen]
qlens = [qlen - kvlen]
kvlens = [kvlen]
page_tables = [list(range(pages_count))]
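    # Paged-KV forward: qlens/kvlens hold the per-request new-token and cached-token
    # counts, and page_tables maps each request's logical KV blocks to physical pages.
    # This single request simply owns all pages, and only the tokens past kvlen are
    # fed in (incremental prefill).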
start_time = time.perf_counter()
llm.forward(qlens,page_tables, kvlens, input_tensor[kvlen:].data_ptr(), output_logits.data_ptr())
llm.forward(qlens, page_tables, kvlens, input_tensor[kvlen:].data_ptr(), output_logits.data_ptr())
end_time = time.perf_counter()
print(f"Forward time: {end_time - start_time:.4f} seconds, tps: {qlens[0] / (end_time - start_time)} tokens/sec")
print(
f"Forward time: {end_time - start_time:.4f} seconds, tps: {qlens[0] / (end_time - start_time)} tokens/sec"
)
logits = output_logits[0]
# print(logits)
# sample
# sample
next_token = torch.argmax(logits).item()
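        # Greedy decoding: always take the argmax token. A temperature or top-p
        # sampler could be substituted here if varied output is wanted.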
# print(f"Next token: {next_token}, {tokenizer.decode(next_token)}")
kvlen = input_tensor.shape[0]
input_tensor = torch.cat((input_tensor, torch.tensor([next_token])), dim=-1)
if next_token == tokenizer.eos_token_id or tokenizer.decode(next_token) == '<|im_end|>':
if next_token == tokenizer.eos_token_id or tokenizer.decode(next_token) == "<|im_end|>":
stream.end()
break
else:
stream.put(torch.tensor([next_token]))
job_id = 0
while True:
try:
            # ---------- let the user decide whether to continue ----------
            choice = input(
                "\n[Enter] start a chat | enter 1 to read a file | enter q/quit/exit to quit: "
            ).strip().lower()
            choice = input("\n[Enter] start a chat | enter 1 to read a file | enter q/quit/exit to quit: ").strip().lower()
if choice in {"q", "quit", "exit"}:
print("收到退出指令,程序结束。")
break
@@ -466,15 +466,4 @@ while True:
print(f"\n发生错误:{e}\n已终止对话 #{job_id},马上重启…")
logger.error(f"Error in job {job_id}: {e}", exc_info=True)
finally:
        job_id += 1 # renumber for the next job, interrupted or not
        job_id += 1  # renumber for the next job, interrupted or not

View File

@@ -1,8 +1,9 @@
import os, sys
import time
os.environ["BLAS_NUM_THREADS"] = "1"
sys.path.insert(0, os.path.dirname(__file__) + "/../build")
import kt_kernel_ext
from kt_kernel import kt_kernel_ext
from kt_kernel_ext.kvcache import ggml_type
import torch
import logging
@@ -188,7 +189,6 @@ def build_mla(layer_idx, json_config, gguf_weights):
config.layer_idx = layer_idx
config.pool = CPUInfer.backend_
config.page_count = pages_count
if q_a_type == "F32":
mla = kt_kernel_ext.mla.MLA_F32(config)
@@ -284,22 +284,21 @@ def build_moegate(layer_idx, json_config, gguf_weights):
json_config["topk_group"],
)
config.routed_scaling_factor = json_config['routed_scaling_factor']
config.routed_scaling_factor = json_config["routed_scaling_factor"]
config.pool = CPUInfer.backend_
weight,weight_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.ffn_gate_inp.weight")
weight, weight_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.ffn_gate_inp.weight")
config.weight = weight.data_ptr()
config.weight_type = type_to_ggml_type(weight_type)
bias,bias_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.exp_probs_b.bias")
bias, bias_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.exp_probs_b.bias")
config.e_score_correction_bias = bias.data_ptr()
config.e_score_correction_bias_type = type_to_ggml_type(bias_type)
gate = kt_kernel_ext.gate.MoEGate(config)
return gate
def build_llm(json_config, gguf_weights):
@@ -312,15 +311,15 @@ def build_llm(json_config, gguf_weights):
general_config.n_shared_experts = json_config["n_shared_experts"]
general_config.max_qlen = max_qlen
lm_heads,lm_heads_type = get_torch_tensor_and_type_from_gguf(gguf_weights, "output.weight")
lm_heads, lm_heads_type = get_torch_tensor_and_type_from_gguf(gguf_weights, "output.weight")
general_config.lm_heads_ptr = lm_heads.data_ptr()
general_config.lm_heads_type = type_to_ggml_type(lm_heads_type)
output_norm, output_norm_type = get_torch_tensor_and_type_from_gguf(gguf_weights, "output_norm.weight")
general_config.norm_weights_ptr = output_norm.data_ptr()
general_config.norm_weights_type = type_to_ggml_type(output_norm_type)
general_config.norm_weights_type = type_to_ggml_type(output_norm_type)
token_embd,token_embd_type = get_torch_tensor_and_type_from_gguf(weights, "token_embd.weight")
token_embd, token_embd_type = get_torch_tensor_and_type_from_gguf(weights, "token_embd.weight")
general_config.token_embd_ptr = token_embd.data_ptr()
general_config.token_embd_type = type_to_ggml_type(token_embd_type)
@@ -330,12 +329,11 @@ def build_llm(json_config, gguf_weights):
model = kt_kernel_ext.DeepseekV3Model(general_config)
llm.model = model
decoder_layers = []
for i in range(json_config["num_hidden_layers"]):
# for i in range(6):
# for i in [0,1,2,3,4,5,6,7,8,9,10]:
layer = kt_kernel_ext.DeepseekV3DecoderLayer(general_config,i)
# for i in range(6):
# for i in [0,1,2,3,4,5,6,7,8,9,10]:
layer = kt_kernel_ext.DeepseekV3DecoderLayer(general_config, i)
attn_norm, attn_norm_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{i}.attn_norm.weight")
ffn_norm, ffn_norm_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{i}.ffn_norm.weight")
@@ -351,11 +349,11 @@ def build_llm(json_config, gguf_weights):
layer.ffn = build_ffn(i, json_config, gguf_weights)
decoder_layers.append(layer)
model.layers = decoder_layers
model.layers = decoder_layers
return llm
safetensor_path = '/home/bd/models/DeepSeek-R1'
safetensor_path = "/home/bd/models/DeepSeek-R1"
json_path = os.path.join(safetensor_path, "config.json")
json_config = json.load(open(json_path, "r"))
print(json_config)
@@ -368,8 +366,8 @@ weights = dict(sorted(weights.items()))
for name, t in weights.items():
# if not name.startswith("blk"):
# if name.startswith("blk.10."):
# if "ffn_gate." in name:
# print(f"Found weight: {t.name}, Shape: {t.shape}, Type: {t.tensor_type.name}, Size: {t.n_elements}")
# if "ffn_gate." in name:
# print(f"Found weight: {t.name}, Shape: {t.shape}, Type: {t.tensor_type.name}, Size: {t.n_elements}")
print(f"Found weight: {t.name}, Shape: {t.shape}, Type: {t.tensor_type.name}, Size: {t.n_elements}")
print("Building LLM ...")
llm = build_llm(json_config, weights)
@@ -384,7 +382,7 @@ prompt_file = None
force_think = False
output_logits = torch.zeros((max_qlen, json_config['vocab_size']), dtype=torch.float32)
output_logits = torch.zeros((max_qlen, json_config["vocab_size"]), dtype=torch.float32)
def start_chat():
@@ -411,16 +409,14 @@ def start_chat():
content = "Please write a piece of quicksort code in C++."
elif os.path.isfile(content):
content = open(content, "r").read()
messages = [{"role": "user", "content": content}]
input_tensor = tokenizer.apply_chat_template(
messages, add_generation_prompt=True, return_tensors="pt"
)
input_tensor = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt")
if force_think:
token_thinks = torch.tensor([tokenizer.encode("<think>\\n",add_special_tokens=False)],device=input_tensor.device)
input_tensor = torch.cat(
[input_tensor, token_thinks], dim=1
token_thinks = torch.tensor(
[tokenizer.encode("<think>\\n", add_special_tokens=False)], device=input_tensor.device
)
input_tensor = torch.cat([input_tensor, token_thinks], dim=1)
    input_tensor = input_tensor.squeeze(0)  # drop the batch dimension
print(f"Input tensor: {input_tensor}, type {input_tensor.dtype}, shape {input_tensor.shape}")
@@ -431,28 +427,27 @@ def start_chat():
qlens = [qlen]
kvlens = [0]
page_tables = [list(range(pages_count))]
llm.forward(qlens,page_tables, kvlens, input_tensor.data_ptr(), output_logits.data_ptr())
llm.forward(qlens, page_tables, kvlens, input_tensor.data_ptr(), output_logits.data_ptr())
logits = output_logits[0]
# print(logits)
# sample
# sample
next_token = torch.argmax(logits).item()
# print(f"Next token: {next_token}, {tokenizer.decode(next_token)}")
input_tensor = torch.cat((input_tensor, torch.tensor([next_token])), dim=-1)
if next_token == tokenizer.eos_token_id or tokenizer.decode(next_token) == '<|im_end|>':
if next_token == tokenizer.eos_token_id or tokenizer.decode(next_token) == "<|im_end|>":
print(stream.end(), end="", flush=True)
break
else:
print(stream.put(torch.tensor([next_token])), end="", flush=True)
job_id = 0
while True:
try:
        # ---------- let the user decide whether to continue ----------
        choice = input(
            "\n[Enter] start a chat | enter q/quit/exit to quit: "
        ).strip().lower()
        choice = input("\n[Enter] start a chat | enter q/quit/exit to quit: ").strip().lower()
if choice in {"q", "quit", "exit"}:
print("收到退出指令,程序结束。")
break
@@ -464,15 +459,4 @@ while True:
        # Ctrl-C at any time abandons the current job and restarts
        print(f"\nCtrl-C detected, chat #{job_id} aborted, restarting…")
finally:
        job_id += 1 # renumber for the next job, interrupted or not
        job_id += 1  # renumber for the next job, interrupted or not

View File

@@ -1,8 +1,9 @@
import os, sys
import time
os.environ["BLAS_NUM_THREADS"] = "1"
sys.path.insert(0, os.path.dirname(__file__) + "/../build")
import kt_kernel_ext
from kt_kernel import kt_kernel_ext
from kt_kernel_ext.kvcache import ggml_type
import torch
import logging
@@ -20,12 +21,13 @@ from transformers import (
logger = logging.getLogger("reader")
from gguf.gguf_reader import GGUFReader
# load_layers = 3
load_layers = None
worker_config = kt_kernel_ext.WorkerPoolConfig()
worker_config.subpool_count = 2
worker_config.subpool_numa_map= [0,1]
worker_config.subpool_thread_count = [72,72]
worker_config.subpool_numa_map = [0, 1]
worker_config.subpool_thread_count = [72, 72]
CPUInfer = kt_kernel_ext.CPUInfer(worker_config)
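# Two sub-pools of 72 threads pinned to NUMA nodes 0 and 1: one pool per socket
# keeps each worker on memory local to its node and avoids cross-socket traffic.
# subpool_thread_count should match the cores actually available per node.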
max_qlen = 4096
@@ -289,22 +291,21 @@ def build_moegate(layer_idx, json_config, gguf_weights):
json_config["topk_group"],
)
config.routed_scaling_factor = json_config['routed_scaling_factor']
config.routed_scaling_factor = json_config["routed_scaling_factor"]
config.pool = CPUInfer.backend_
weight,weight_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.ffn_gate_inp.weight")
weight, weight_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.ffn_gate_inp.weight")
config.weight = weight.data_ptr()
config.weight_type = type_to_ggml_type(weight_type)
bias,bias_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.exp_probs_b.bias")
bias, bias_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.exp_probs_b.bias")
config.e_score_correction_bias = bias.data_ptr()
config.e_score_correction_bias_type = type_to_ggml_type(bias_type)
gate = kt_kernel_ext.gate.MoEGate(config)
return gate
def build_llm(json_config, gguf_weights):
@@ -317,15 +318,15 @@ def build_llm(json_config, gguf_weights):
general_config.n_shared_experts = json_config["n_shared_experts"]
general_config.max_qlen = max_qlen
lm_heads,lm_heads_type = get_torch_tensor_and_type_from_gguf(gguf_weights, "output.weight")
lm_heads, lm_heads_type = get_torch_tensor_and_type_from_gguf(gguf_weights, "output.weight")
general_config.lm_heads_ptr = lm_heads.data_ptr()
general_config.lm_heads_type = type_to_ggml_type(lm_heads_type)
output_norm, output_norm_type = get_torch_tensor_and_type_from_gguf(gguf_weights, "output_norm.weight")
general_config.norm_weights_ptr = output_norm.data_ptr()
general_config.norm_weights_type = type_to_ggml_type(output_norm_type)
general_config.norm_weights_type = type_to_ggml_type(output_norm_type)
token_embd,token_embd_type = get_torch_tensor_and_type_from_gguf(weights, "token_embd.weight")
token_embd, token_embd_type = get_torch_tensor_and_type_from_gguf(weights, "token_embd.weight")
general_config.token_embd_ptr = token_embd.data_ptr()
general_config.token_embd_type = type_to_ggml_type(token_embd_type)
@@ -335,13 +336,12 @@ def build_llm(json_config, gguf_weights):
model = kt_kernel_ext.DeepseekV3Model(general_config)
llm.model = model
decoder_layers = []
real_load_layers = json_config["num_hidden_layers"] if load_layers is None else load_layers
for i in range(real_load_layers):
# for i in [2,3]:
layer = kt_kernel_ext.DeepseekV3DecoderLayer(general_config,i)
# for i in [2,3]:
layer = kt_kernel_ext.DeepseekV3DecoderLayer(general_config, i)
attn_norm, attn_norm_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{i}.attn_norm.weight")
ffn_norm, ffn_norm_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{i}.ffn_norm.weight")
@@ -357,11 +357,11 @@ def build_llm(json_config, gguf_weights):
layer.ffn = build_ffn(i, json_config, gguf_weights)
decoder_layers.append(layer)
model.layers = decoder_layers
model.layers = decoder_layers
return llm
safetensor_path = '/home/bd/models/DeepSeek-R1'
safetensor_path = "/home/bd/models/DeepSeek-R1"
json_path = os.path.join(safetensor_path, "config.json")
json_config = json.load(open(json_path, "r"))
print(json_config)
@@ -372,13 +372,13 @@ weights = dict(sorted(weights.items()))
# for name, t in weights.items():
# if not name.startswith("blk"):
# if name.startswith("blk.10."):
# if "ffn_gate." in name:
# print(f"Found weight: {t.name}, Shape: {t.shape}, Type: {t.tensor_type.name}, Size: {t.n_elements}")
# print(f"Found weight: {t.name}, Shape: {t.shape}, Type: {t.tensor_type.name}, Size: {t.n_elements}")
print("Building LLM ...")
# if not name.startswith("blk"):
# if name.startswith("blk.10."):
# if "ffn_gate." in name:
# print(f"Found weight: {t.name}, Shape: {t.shape}, Type: {t.tensor_type.name}, Size: {t.n_elements}")
# print(f"Found weight: {t.name}, Shape: {t.shape}, Type: {t.tensor_type.name}, Size: {t.n_elements}")
print("Building LLM ...")
load_start_time = time.perf_counter()
llm = build_llm(json_config, weights)
load_end_time = time.perf_counter()
@@ -395,22 +395,20 @@ config = AutoConfig.from_pretrained(safetensor_path, trust_remote_code=True)
force_think = False
output_logits = torch.zeros((max_qlen, json_config['vocab_size']), dtype=torch.float32)
output_logits = torch.zeros((max_qlen, json_config["vocab_size"]), dtype=torch.float32)
def start_chat(content=None):
if content is None:
content = input("Chat: ")
messages = [{"role": "user", "content": content}]
input_tensor = tokenizer.apply_chat_template(
messages, add_generation_prompt=True, return_tensors="pt"
)
input_tensor = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt")
if force_think:
token_thinks = torch.tensor([tokenizer.encode("<think>\\n",add_special_tokens=False)],device=input_tensor.device)
input_tensor = torch.cat(
[input_tensor, token_thinks], dim=1
token_thinks = torch.tensor(
[tokenizer.encode("<think>\\n", add_special_tokens=False)], device=input_tensor.device
)
input_tensor = torch.cat([input_tensor, token_thinks], dim=1)
    input_tensor = input_tensor.squeeze(0)  # drop the batch dimension
print(f"Input tensor: {input_tensor}, type {input_tensor.dtype}, shape {input_tensor.shape}")
@@ -425,30 +423,32 @@ def start_chat(content=None):
kvlens = [0]
page_tables = [list(range(pages_count))]
start_time = time.perf_counter()
llm.forward(qlens,page_tables, kvlens, input_tensor.data_ptr(), output_logits.data_ptr())
llm.forward(qlens, page_tables, kvlens, input_tensor.data_ptr(), output_logits.data_ptr())
end_time = time.perf_counter()
print(f"Forward time: {end_time - start_time:.4f} seconds, tps: {qlens[0] / (end_time - start_time)} tokens/sec")
print(
f"Forward time: {end_time - start_time:.4f} seconds, tps: {qlens[0] / (end_time - start_time)} tokens/sec"
)
logits = output_logits[0]
# print(logits)
# sample
# sample
next_token = torch.argmax(logits).item()
# print(f"Next token: {next_token}, {tokenizer.decode(next_token)}")
# kvlen = input_tensor.shape[0]
input_tensor = torch.cat((input_tensor, torch.tensor([next_token])), dim=-1)
if next_token == tokenizer.eos_token_id or tokenizer.decode(next_token) == '<|im_end|>':
if next_token == tokenizer.eos_token_id or tokenizer.decode(next_token) == "<|im_end|>":
stream.end()
break
else:
stream.put(torch.tensor([next_token]))
job_id = 0
while True:
try:
            # ---------- let the user decide whether to continue ----------
            choice = input(
                "\n[Enter] start a chat | enter 1 to read a file | enter q/quit/exit to quit: "
            ).strip().lower()
            choice = input("\n[Enter] start a chat | enter 1 to read a file | enter q/quit/exit to quit: ").strip().lower()
if choice in {"q", "quit", "exit"}:
print("收到退出指令,程序结束。")
break
@@ -472,15 +472,4 @@ while True:
print(f"\n发生错误:{e}\n已终止对话 #{job_id},马上重启…")
logger.error(f"Error in job {job_id}: {e}", exc_info=True)
finally:
        job_id += 1 # renumber for the next job, interrupted or not
        job_id += 1  # renumber for the next job, interrupted or not

View File

@@ -1,15 +1,17 @@
import math
import os,sys
import os, sys
import time
from typing import Optional
os.environ["BLAS_NUM_THREADS"] = "1"
sys.path.insert(0, os.path.dirname(__file__) + '/../build')
import kt_kernel_ext
sys.path.insert(0, os.path.dirname(__file__) + "/../build")
from kt_kernel import kt_kernel_ext
from kt_kernel_ext.kvcache import ggml_type
import torch
from torch import nn
import torch.nn.functional as F
# from modeling_deepseek_v3 import MoEGate
from configuration_deepseek_v3 import DeepseekV3Config
@@ -28,17 +30,20 @@ n_group = config.n_group
topk_group = config.topk_group
routed_scaling_factor = config.routed_scaling_factor
weights = torch.randn((n_routed_experts, hidden_size), dtype=torch.float32).to('cpu').contiguous()
bias = torch.randn((n_routed_experts,), dtype=torch.float32).to('cpu').contiguous()
weights = torch.randn((n_routed_experts, hidden_size), dtype=torch.float32).to("cpu").contiguous()
bias = torch.randn((n_routed_experts,), dtype=torch.float32).to("cpu").contiguous()
# weights = torch.randn((n_routed_experts, hidden_size), dtype=torch.float16).to('cpu').contiguous ()
def load_fp32_tensor(file_path, shape):
return torch.zeros(shape, dtype=torch.float32).to('cpu').contiguous()
with open(file_path, 'rb') as f:
return torch.zeros(shape, dtype=torch.float32).to("cpu").contiguous()
with open(file_path, "rb") as f:
raw_data = f.read()
tensor = torch.frombuffer(raw_data, dtype=torch.float32)
        tensor = tensor.view(shape)  # reshape to the caller-provided shape
return tensor
class MoEGate(nn.Module):
def __init__(self, config):
super().__init__()
@@ -54,13 +59,9 @@ class MoEGate(nn.Module):
# topk selection algorithm
self.norm_topk_prob = config.norm_topk_prob
self.gating_dim = config.hidden_size
self.weight = nn.Parameter(
torch.empty((self.n_routed_experts, self.gating_dim))
)
self.weight = nn.Parameter(torch.empty((self.n_routed_experts, self.gating_dim)))
if self.topk_method == "noaux_tc":
self.e_score_correction_bias = nn.Parameter(
torch.empty((self.n_routed_experts))
)
self.e_score_correction_bias = nn.Parameter(torch.empty((self.n_routed_experts)))
self.reset_parameters()
def reset_parameters(self) -> None:
@@ -73,93 +74,88 @@ class MoEGate(nn.Module):
### compute gating score
hidden_states = hidden_states.view(-1, h)
h_to_check = load_fp32_tensor('/home/yzw/xwy/Projects/ktransformers-dev/csrc/ktransformers_ext/examples/debug/gate_input',(seq_len,h))
h_to_check = load_fp32_tensor(
"/home/yzw/xwy/Projects/ktransformers-dev/csrc/ktransformers_ext/examples/debug/gate_input", (seq_len, h)
)
diff = (h_to_check - hidden_states).abs().max()
# print("hidden_states diff:", diff)
# assert diff<0.02
bias_to_check = load_fp32_tensor('/home/yzw/xwy/Projects/ktransformers-dev/csrc/ktransformers_ext/examples/debug/bias',(n_routed_experts))
bias_to_check = load_fp32_tensor(
"/home/yzw/xwy/Projects/ktransformers-dev/csrc/ktransformers_ext/examples/debug/bias", (n_routed_experts)
)
diff = (bias - bias_to_check).abs().max()
# print('bias diff:',diff)
# assert diff < 0.02
logits = F.linear(hidden_states.type(torch.float32), self.weight.type(torch.float32), None)
logits = F.linear(
hidden_states.type(torch.float32), self.weight.type(torch.float32), None
logits_to_check = load_fp32_tensor(
"/home/yzw/xwy/Projects/ktransformers-dev/csrc/ktransformers_ext/examples/debug/gate_logits",
(seq_len, n_routed_experts),
)
logits_to_check = load_fp32_tensor('/home/yzw/xwy/Projects/ktransformers-dev/csrc/ktransformers_ext/examples/debug/gate_logits',(seq_len,n_routed_experts))
diff = (logits_to_check - logits).abs().max()
# print("logits diff:", diff)
# assert diff < 0.02
if self.scoring_func == "sigmoid":
scores = logits.sigmoid()
else:
            raise NotImplementedError(
                f"unsupported scoring function for MoE gating: {self.scoring_func}"
            )
            raise NotImplementedError(f"unsupported scoring function for MoE gating: {self.scoring_func}")
### select top-k experts
if self.topk_method == "noaux_tc":
# assert not self.training
scores_for_choice = scores.view(bsz * seq_len, -1) + self.e_score_correction_bias.unsqueeze(0)
scores_to_check = load_fp32_tensor('/home/yzw/xwy/Projects/ktransformers-dev/csrc/ktransformers_ext/examples/debug/scores_to_choice',(seq_len,n_routed_experts))
scores_to_check = load_fp32_tensor(
"/home/yzw/xwy/Projects/ktransformers-dev/csrc/ktransformers_ext/examples/debug/scores_to_choice",
(seq_len, n_routed_experts),
)
diff = (scores_for_choice - scores_to_check).abs().max()
print(f'score for choice diff = {diff}')
print(f"score for choice diff = {diff}")
group_scores = (
scores_for_choice.view(bsz * seq_len, self.n_group, -1).topk(2, dim=-1)[0].sum(dim = -1)
scores_for_choice.view(bsz * seq_len, self.n_group, -1).topk(2, dim=-1)[0].sum(dim=-1)
) # [n, n_group]
group_scores_to_check = load_fp32_tensor('/home/yzw/xwy/Projects/ktransformers-dev/csrc/ktransformers_ext/examples/debug/group_scores',(seq_len,n_group))
group_scores_to_check = load_fp32_tensor(
"/home/yzw/xwy/Projects/ktransformers-dev/csrc/ktransformers_ext/examples/debug/group_scores",
(seq_len, n_group),
)
diff = (group_scores - group_scores_to_check).abs().max()
print(f'group scores diff = {diff}')
print(f"group scores diff = {diff}")
group_idx = torch.topk(
group_scores, k=self.topk_group, dim=-1, sorted=False
)[
1
] # [n, top_k_group]
group_idx = torch.topk(group_scores, k=self.topk_group, dim=-1, sorted=False)[1] # [n, top_k_group]
group_mask = torch.zeros_like(group_scores) # [n, n_group]
group_mask.scatter_(1, group_idx, 1) # [n, n_group]
score_mask = (
group_mask.unsqueeze(-1)
.expand(
bsz * seq_len, self.n_group, self.n_routed_experts // self.n_group
)
.expand(bsz * seq_len, self.n_group, self.n_routed_experts // self.n_group)
.reshape(bsz * seq_len, -1)
) # [n, e]
tmp_scores = scores_for_choice.masked_fill(~score_mask.bool(), float("-inf")) # [n, e]
tmp_scores_to_check = load_fp32_tensor('/home/yzw/xwy/Projects/ktransformers-dev/csrc/ktransformers_ext/examples/debug/gate_logits_toped',(seq_len,n_routed_experts))
is_close = torch.isclose(tmp_scores, tmp_scores_to_check, rtol=1e-2, atol=1e-2, equal_nan=True)
print(f'tmp_score ok {is_close.all()}')
_, topk_idx = torch.topk(
tmp_scores, k=self.top_k, dim=-1, sorted=False
tmp_scores_to_check = load_fp32_tensor(
"/home/yzw/xwy/Projects/ktransformers-dev/csrc/ktransformers_ext/examples/debug/gate_logits_toped",
(seq_len, n_routed_experts),
)
is_close = torch.isclose(tmp_scores, tmp_scores_to_check, rtol=1e-2, atol=1e-2, equal_nan=True)
print(f"tmp_score ok {is_close.all()}")
_, topk_idx = torch.topk(tmp_scores, k=self.top_k, dim=-1, sorted=False)
topk_weight = scores.gather(1, topk_idx)
else:
            raise NotImplementedError(
                f"unsupported TopK function for MoE gating: {self.topk_method}"
            )
            raise NotImplementedError(f"unsupported TopK function for MoE gating: {self.topk_method}")
### norm gate to sum 1
if self.top_k > 1 and self.norm_topk_prob:
denominator = topk_weight.sum(dim=-1, keepdim=True) + 1e-20
topk_weight = topk_weight / denominator
topk_weight = topk_weight * self.routed_scaling_factor # must multiply the scaling factor
topk_weight = topk_weight * self.routed_scaling_factor # must multiply the scaling factor
return topk_idx, topk_weight
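        # Group-limited ("noaux_tc") routing, in brief: experts are split into
        # n_group groups; each group is scored by the sum of its top-2 biased
        # scores; only the best topk_group groups survive, and the rest are
        # masked to -inf before the final top_k pick. Gate weights come from
        # the unbiased sigmoid scores, optionally renormalized, then scaled by
        # routed_scaling_factor.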
def torch_gate(hidden_states):
hidden_states.unsqueeze_(0)
gate = MoEGate(config)
@@ -172,11 +168,11 @@ def torch_gate(hidden_states):
def cpuinfer_gate(hidden_states):
config = kt_kernel_ext.gate.GateConfig(
hidden_size,
num_experts_per_token,
n_routed_experts,
n_group,
topk_group,
hidden_size,
num_experts_per_token,
n_routed_experts,
n_group,
topk_group,
)
CPUInfer = kt_kernel_ext.CPUInfer(64)
@@ -188,32 +184,29 @@ def cpuinfer_gate(hidden_states):
config.e_score_correction_bias = bias.data_ptr()
config.e_score_correction_bias_type = ggml_type.FP32
gate = kt_kernel_ext.gate.MoEGate(config)
gate = kt_kernel_ext.gate.MoEGate(config)
expert_ids = torch.zeros((seqlen, num_experts_per_token), dtype=torch.int64).to("cpu").contiguous()
expert_weights = torch.zeros((seqlen, num_experts_per_token), dtype=torch.float32).to("cpu").contiguous()
expert_ids = torch.zeros((seqlen, num_experts_per_token), dtype=torch.int64).to('cpu').contiguous()
expert_weights = torch.zeros((seqlen, num_experts_per_token), dtype=torch.float32).to('cpu').contiguous()
gate.forward(seqlen,hidden_states.data_ptr(),expert_ids.data_ptr(), expert_weights.data_ptr())
gate.forward(seqlen, hidden_states.data_ptr(), expert_ids.data_ptr(), expert_weights.data_ptr())
# print(expert_ids,expert_weights)
return expert_ids, expert_weights
input = torch.randn(seqlen, hidden_size, dtype=torch.float32).to('cpu').contiguous()
input = torch.randn(seqlen, hidden_size, dtype=torch.float32).to("cpu").contiguous()
# print(input)
ids,we = cpuinfer_gate(input)
ids, we = cpuinfer_gate(input)
idx = torch.argsort(ids, dim=-1, descending=True)
ids = torch.gather(ids,dim=-1,index=idx)
we = torch.gather(we,dim=-1,index=idx)
ids = torch.gather(ids, dim=-1, index=idx)
we = torch.gather(we, dim=-1, index=idx)
std_ids,std_we= torch_gate(input)
std_ids, std_we = torch_gate(input)
idx = torch.argsort(std_ids, dim=-1, descending=True)
std_we = torch.gather(std_we,dim=-1,index=idx)
std_ids = torch.gather(std_ids,dim=-1,index=idx)
std_we = torch.gather(std_we, dim=-1, index=idx)
std_ids = torch.gather(std_ids, dim=-1, index=idx)
# print("ids diff:", torch.abs(std_ids - ids).max())
@@ -221,28 +214,3 @@ std_ids = torch.gather(std_ids,dim=-1,index=idx)
assert torch.abs(std_ids - ids).max() == 0, "Expert IDs do not match!"
assert torch.abs(std_we - we).max() < 1e-2, "Expert Weights do not match!"
print("Expert IDs and Weights match successfully!")

View File

@@ -6,7 +6,7 @@ from typing import Dict, Literal
sys.path.insert(0, os.path.dirname(__file__) + "/../build")
import torch
import kt_kernel_ext
from kt_kernel import kt_kernel_ext
torch.manual_seed(42)
@@ -132,6 +132,7 @@ def pack_to_int32(value: torch.Tensor, num_bits: int, packed_dim: Literal[0, 1]
return packed
def pack_tensor_per_row(q: torch.Tensor, num_bits: int) -> torch.Tensor:
e, rows, cols = q.shape
flat = q.view(e * rows, cols)
@@ -283,9 +284,9 @@ def run_case(pattern: str) -> Dict[str, float]:
CPUInfer.sync()
input_tensor_fp16 = input_tensor.to(torch.float16)
t_output = moe_torch(
input_tensor_fp16, expert_ids, weights, gate_fp16, up_fp16, down_fp16
).to(torch.bfloat16)
t_output = moe_torch(input_tensor_fp16, expert_ids, weights, gate_fp16, up_fp16, down_fp16).to(
torch.bfloat16
)
t_output = t_output.flatten()
output = output.flatten()

View File

@@ -11,7 +11,7 @@ import numpy as np
# if REPO_ROOT not in sys.path:
# sys.path.insert(0, REPO_ROOT)
import kt_kernel_ext
from kt_kernel import kt_kernel_ext
from kt_kernel_ext import CPUInfer
@@ -57,10 +57,10 @@ def allocate_weights(expert_num, hidden_size, intermediate_size, group_size):
def main():
torch.manual_seed(123)
expert_num = 256 # Total experts
expert_num = 256 # Total experts
gpu_experts = expert_num # Number of experts on GPU
gpu_tp_count = 2 # Number of TP parts
num_experts_per_tok = 8
hidden_size = 7168
intermediate_size = 2048
@@ -89,9 +89,7 @@ def main():
moe = kt_kernel_ext.moe.AMXInt4_KGroup_MOE(cfg)
physical_to_logical_map = (
torch.arange(expert_num, dtype=torch.int64, device="cpu").contiguous()
)
physical_to_logical_map = torch.arange(expert_num, dtype=torch.int64, device="cpu").contiguous()
cpuinfer.submit(moe.load_weights_task(physical_to_logical_map.data_ptr()))
cpuinfer.sync()
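    # An identity physical_to_logical_map: physical expert slot i holds logical
    # expert i. A deployment could permute this to spread hot experts around.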
@@ -169,6 +167,7 @@ def main():
total_bytes = total_weights // group_size + total_weights // 2
print(f"write_weight_scale_to_buffer time: {elapsed_ms:.2f} ms")
print(f"Throughput: {total_bytes / (elapsed_ms * 1e6):.2f} GB/s")
def split_expert_tensor(tensor, chunk):
"""Split tensor by experts"""
return [tensor[i * chunk : (i + 1) * chunk] for i in range(expert_num)]
@@ -229,10 +228,10 @@ def main():
tp_scale_offset = col_scale_start + tp_idx * tp_slice_scale_size
down_weight_tp_parts.append(
down_q_experts[expert_idx][tp_weight_offset:tp_weight_offset + tp_slice_weight_size]
down_q_experts[expert_idx][tp_weight_offset : tp_weight_offset + tp_slice_weight_size]
)
down_scale_tp_parts.append(
down_scale_experts[expert_idx][tp_scale_offset:tp_scale_offset + tp_slice_scale_size]
down_scale_experts[expert_idx][tp_scale_offset : tp_scale_offset + tp_slice_scale_size]
)
# Concatenate all column slices for this TP
@@ -260,7 +259,9 @@ def main():
assert torch.equal(w2_weight_bufs[tp_idx], expected_w2_weight), f"w2 weight bytes mismatch for TP {tp_idx}"
assert torch.allclose(w2_scale_bufs[tp_idx], expected_w2_scale), f"w2 scale values mismatch for TP {tp_idx}"
print(f"\n✓ write_weight_scale_to_buffer passed: extracted {gpu_experts} GPU experts across {gpu_tp_count} TP parts from total {expert_num} experts")
print(
f"\n✓ write_weight_scale_to_buffer passed: extracted {gpu_experts} GPU experts across {gpu_tp_count} TP parts from total {expert_num} experts"
)
if __name__ == "__main__":

View File

@@ -1,26 +1,27 @@
#!/usr/bin/env python
# coding=utf-8
'''
Description :
"""
Description :
Author : chenht2022
Date : 2024-07-25 10:32:05
Version : 1.0.0
LastEditors : chenht2022
LastEditors : chenht2022
LastEditTime : 2024-08-06 10:36:59
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
'''
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
"""
import os, sys
import time
sys.path.append(os.path.dirname(__file__) + '/../build')
import kt_kernel_ext
sys.path.append(os.path.dirname(__file__) + "/../build")
from kt_kernel import kt_kernel_ext
import torch
input_size = 16384
output_size = 5120
stride = 32
group_max_len = 1024
proj_type = 1 # ggml_type::GGML_TYPE_F16
hidden_type = 1 # ggml_type::GGML_TYPE_F16
proj_type = 1 # ggml_type::GGML_TYPE_F16
hidden_type = 1 # ggml_type::GGML_TYPE_F16
qlen = 30
layer_num = 10
CPUInfer = kt_kernel_ext.CPUInfer(48)
@@ -30,8 +31,10 @@ with torch.inference_mode(mode=True):
linears = []
projs = []
for _ in range(layer_num):
proj = torch.randn((output_size, input_size), dtype=torch.float16, device = "cuda").to("cpu").contiguous()
config = kt_kernel_ext.linear.LinearConfig(input_size, output_size, stride, group_max_len, proj.data_ptr(), proj_type, hidden_type)
proj = torch.randn((output_size, input_size), dtype=torch.float16, device="cuda").to("cpu").contiguous()
config = kt_kernel_ext.linear.LinearConfig(
input_size, output_size, stride, group_max_len, proj.data_ptr(), proj_type, hidden_type
)
linear = kt_kernel_ext.linear.Linear(config)
projs.append(proj)
linears.append(linear)
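        # LinearConfig packs the GEMM shape (input_size x output_size), a stride
        # (presumably the kernel's tiling width), the max batched rows
        # (group_max_len), a raw weight pointer, and the ggml dtypes of weights
        # and activations (1 == GGML_TYPE_F16, per the comments above).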
@@ -43,20 +46,14 @@ with torch.inference_mode(mode=True):
output = torch.empty((qlen, output_size), dtype=torch.float16).contiguous()
input = input / 100
CPUInfer.submit(
linear.forward(
qlen,
input.data_ptr(),
output.data_ptr()
)
)
CPUInfer.submit(linear.forward(qlen, input.data_ptr(), output.data_ptr()))
CPUInfer.sync()
# print('cpuinfer output', output)
proj = projs[i%layer_num]
proj = projs[i % layer_num]
t_output = torch.mm(input, proj.t())
# print('torch output', t_output)
diff = torch.mean(torch.abs(output - t_output)) / torch.mean(torch.abs(t_output))
print('diff = ', diff)
assert(diff < 0.001)
print("diff = ", diff)
assert diff < 0.001
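        # The CPU F16 linear kernel is validated against torch.mm on the same
        # weights; inputs were scaled down by 100 earlier to keep the FP16
        # accumulation comfortably inside its dynamic range.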

View File

@@ -1,19 +1,22 @@
import logging
import os,sys
import os, sys
import time
from typing import Optional
os.environ["BLAS_NUM_THREADS"] = "1"
sys.path.insert(0, os.path.dirname(__file__) + '/../build')
import kt_kernel_ext
sys.path.insert(0, os.path.dirname(__file__) + "/../build")
from kt_kernel import kt_kernel_ext
from kt_kernel_ext.kvcache import ggml_type
import torch
from torch import inf, nn
from torch.nn import init
from torch_attention import apply_rotary_pos_emb,DeepseekV2RMSNorm,KDeepSeekV3Cache,DeepseekV3YarnRotaryEmbedding
from torch_attention import apply_rotary_pos_emb, DeepseekV2RMSNorm, KDeepSeekV3Cache, DeepseekV3YarnRotaryEmbedding
logger = logging.getLogger("reader")
from gguf.gguf_reader import GGUFReader
def read_gguf_file(gguf_file_path):
"""
Reads and prints key-value pairs and tensor information from a GGUF file in an improved format.
@@ -46,12 +49,15 @@ def read_gguf_file(gguf_file_path):
re.append(tensor)
return re
def get_torch_tensor_from_gguf(gguf_weights, name):
return torch.from_numpy(gguf_weights[name].data).contiguous()
def get_torch_tensor_and_type_from_gguf(gguf_weights, name):
return torch.from_numpy(gguf_weights[name].data).contiguous(), gguf_weights[name].tensor_type.name
def type_to_ggml_type(type):
if type == "F32":
return ggml_type.FP32
@@ -70,12 +76,12 @@ seed = 42  # any integer works as the seed
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
qlen = 3212
qlen = 3212
kvlen = 0
page_table = range(20)
bsz_tensors=torch.tensor([1])
bsz_tensors = torch.tensor([1])
page_size = 256
@@ -94,8 +100,7 @@ rope_theta = 10000
max_qlen = 4096
max_kvlen = 4096
max_position_embeddings = 163840
max_position_embeddings = 163840
rope_scaling = {
@@ -105,11 +110,10 @@ rope_scaling = {
"mscale": 1.0,
"mscale_all_dim": 1.0,
"original_max_position_embeddings": 4096,
"type": "yarn"
"type": "yarn",
}
CPUInfer = kt_kernel_ext.CPUInfer(30)
validation_iter = 100
@@ -119,15 +123,16 @@ weight_type = torch.bfloat16
# weight_type = torch.float16
input_type = {torch.float32:torch.float32,
torch.float16:torch.float16,
torch.bfloat16:torch.float32,
}[weight_type]
input_type = {
torch.float32: torch.float32,
torch.float16: torch.float16,
torch.bfloat16: torch.float32,
}[weight_type]
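# Activation dtype follows the weight dtype except for bfloat16 weights, which
# pair with float32 activations here, plausibly because bf16's 8-bit mantissa
# is too coarse for the activation side of this test.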
q_a_proj = nn.Linear(hidden_size, q_lora_rank, bias=False, dtype=weight_type)
q_b_proj = nn.Linear(q_lora_rank, num_heads * (nope_size+rope_size) , bias=False, dtype=weight_type)
q_b_proj = nn.Linear(q_lora_rank, num_heads * (nope_size + rope_size), bias=False, dtype=weight_type)
kv_a_proj_with_mqa = nn.Linear(hidden_size, kv_lora_rank + rope_size, bias=False, dtype=weight_type)
kv_b_proj = nn.Linear( num_heads * (nope_size + nope_size),kv_lora_rank, bias=False, dtype=weight_type)
kv_b_proj = nn.Linear(num_heads * (nope_size + nope_size), kv_lora_rank, bias=False, dtype=weight_type)
o_proj = nn.Linear(num_heads * nope_size, hidden_size, bias=False, dtype=weight_type)
q_a_norm = torch.ones(hidden_size, dtype=torch.float32)
kv_a_norm = torch.ones(hidden_size, dtype=torch.float32)
@@ -190,7 +195,7 @@ if use_real_weights := True:
o_proj_weight, type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.attn_output.weight")
o_proj.weight = nn.Parameter(o_proj_weight.view(torch.bfloat16), requires_grad=False)
else:
init.normal_(q_a_proj.weight, mean=0.0, std=0.02)
init.normal_(q_b_proj.weight, mean=0.0, std=0.02)
@@ -203,16 +208,16 @@ q_absorb = x_reshaped[:, 0]
out_absorb = x_reshaped[:, 1]
hidden_states = torch.randn((qlen, hidden_size), dtype=input_type).to('cpu').contiguous()
hidden_states = torch.randn((qlen, hidden_size), dtype=input_type).to("cpu").contiguous()
def test_cpu_mla():
os.environ["BLAS_NUM_THREADS"] = "1"
q_a_proj_weight = q_a_proj.weight.to(weight_type).to('cpu').contiguous()
q_b_proj_weight = q_b_proj.weight.to(weight_type).to('cpu').contiguous()
kv_a_proj_with_mqa_weight = kv_a_proj_with_mqa.weight.to('cpu').to(weight_type).contiguous()
kv_b_proj_weight = kv_b_proj.weight.to(weight_type).to('cpu').contiguous()
o_proj_weight = o_proj.weight.to(weight_type).to('cpu').contiguous()
q_a_proj_weight = q_a_proj.weight.to(weight_type).to("cpu").contiguous()
q_b_proj_weight = q_b_proj.weight.to(weight_type).to("cpu").contiguous()
kv_a_proj_with_mqa_weight = kv_a_proj_with_mqa.weight.to("cpu").to(weight_type).contiguous()
kv_b_proj_weight = kv_b_proj.weight.to(weight_type).to("cpu").contiguous()
o_proj_weight = o_proj.weight.to(weight_type).to("cpu").contiguous()
config = kt_kernel_ext.mla.MLAConfig(
hidden_size,
@@ -224,7 +229,7 @@ def test_cpu_mla():
)
config.max_qlen = max_qlen
config.max_kvlen = max_kvlen
config.max_position_embeddings = max_position_embeddings
config.max_position_embeddings = max_position_embeddings
config.rope_scaling_factor = rope_scaling["factor"]
config.rope_theta = rope_theta
config.rope_scaling_beta_fast = rope_scaling["beta_fast"]
@@ -245,7 +250,6 @@ def test_cpu_mla():
config.kv_a_norm_type = ggml_type.FP32
config.page_count = pages_count
if weight_type == torch.float32:
config.q_a_proj_type = ggml_type.FP32
config.q_b_proj_type = ggml_type.FP32
@@ -267,10 +271,8 @@ def test_cpu_mla():
else:
raise ValueError(f"Unsupported data type: {weight_type}")
config.pool = CPUInfer.backend_
if weight_type == torch.float32:
mla = kt_kernel_ext.mla.MLA_F32(config)
elif weight_type == torch.float16:
@@ -280,54 +282,53 @@ def test_cpu_mla():
mla = kt_kernel_ext.mla.MLA_QUAN_F32(config)
else:
raise ValueError(f"Unsupported data type: {weight_type}")
mla.load_weights()
mla.set_local_pages(pages_count)
output = torch.zeros((qlen, hidden_size), dtype=input_type).to('cpu').contiguous()
mla.forward([qlen],[page_table],[kvlen],hidden_states.data_ptr(),output.data_ptr())
print("CPU MLA Output: ",output)
output = torch.zeros((qlen, hidden_size), dtype=input_type).to("cpu").contiguous()
mla.forward([qlen], [page_table], [kvlen], hidden_states.data_ptr(), output.data_ptr())
print("CPU MLA Output: ", output)
return output
def load_fp16_tensor(file_path, shape):
# return load_fp32_tensor(file_path, shape)
return torch.zeros(shape)
with open(file_path, 'rb') as f:
with open(file_path, "rb") as f:
raw_data = f.read()
tensor = torch.frombuffer(raw_data, dtype=weight_type)
        tensor = tensor.view(shape)  # reshape to the caller-provided shape
return tensor
def load_fp32_tensor(file_path, shape):
return torch.zeros(shape)
with open(file_path, 'rb') as f:
with open(file_path, "rb") as f:
raw_data = f.read()
tensor = torch.frombuffer(raw_data, dtype=torch.float32)
        tensor = tensor.view(shape)  # reshape to the caller-provided shape
return tensor
def test_torch():
torch.set_grad_enabled(False)
softmax_scale = (nope_size + rope_size) ** -0.5
    # the 1 is the number of compressed KV heads
k_caches = torch.randn(1,pages_count, page_size, 1, kv_lora_rank + rope_size).to(weight_type)
k_caches = torch.randn(1, pages_count, page_size, 1, kv_lora_rank + rope_size).to(weight_type)
kv_cache = KDeepSeekV3Cache(page_size=page_size, kv_lora_rank=kv_lora_rank, k_caches=k_caches)
q_a_layernorm = DeepseekV2RMSNorm(q_lora_rank)
q_a_layernorm.weight = nn.Parameter( q_a_norm,requires_grad=False)
q_a_layernorm.weight = nn.Parameter(q_a_norm, requires_grad=False)
x = torch.randn(q_lora_rank, dtype=weight_type)*100
x = torch.randn(q_lora_rank, dtype=weight_type) * 100
print(x)
print(q_a_layernorm(x))
kv_a_layernorm = DeepseekV2RMSNorm(kv_lora_rank)
kv_a_layernorm.weight = nn.Parameter(kv_a_norm, requires_grad=False)
    # step 3: split into two tensors
    # q_absorb, out_absorb = x_permuted[:, 0], x_permuted[:, 1]  # both (num_heads, nope_size, kv_lora_rank)
# q_absorb = kv_b_proj[:, ] # torch.randn(num_heads, nope_size, kv_lora_rank, dtype=data_type)
@@ -348,65 +349,64 @@ def test_torch():
    # kv_indices is [0:bsz], page_idx = [0:bsz], page_offset = [kvlen:qlen+kvlen]
# last_page_len = [qlen+kvlen,...] layer_idx = 1
# position_ids = [kvlen:qlen+kvlen]
q_indptr = torch.tensor([0,qlen]).to(torch.int32)
q_indptr = torch.tensor([0, qlen]).to(torch.int32)
kv_indptr = torch.tensor([0,(qlen+kvlen+page_size-1)//page_size]).to(torch.int32)
kv_indptr = torch.tensor([0, (qlen + kvlen + page_size - 1) // page_size]).to(torch.int32)
kv_indices = torch.tensor(range(pages_count)).to(torch.int32)
page_idx = torch.tensor([i//page_size for i in range(kvlen,kvlen+qlen)] ).to(torch.int32)
page_offset = torch.tensor( [i%page_size for i in range(kvlen, kvlen + qlen)]).to(torch.int32)
page_idx = torch.tensor([i // page_size for i in range(kvlen, kvlen + qlen)]).to(torch.int32)
page_offset = torch.tensor([i % page_size for i in range(kvlen, kvlen + qlen)]).to(torch.int32)
last_page_len = torch.tensor([256], device=hidden_states.device)
position_ids = torch.tensor(range(kvlen, kvlen + qlen)).to(torch.int32)
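    # CSR-style paged-KV bookkeeping: q_indptr/kv_indptr give per-request offsets
    # into the token and page lists, kv_indices names the physical pages, and
    # page_idx/page_offset map each new token to its (page, slot) location.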
    # build the mask row by row: [qlen, kvlen+qlen]
attention_masks = torch.zeros((max_qlen, max_kvlen), dtype=weight_type)
for i in range(max_qlen):
attention_masks[i, i + kvlen + 1:] = -inf
attention_masks[i, i + kvlen + 1 :] = -inf
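        # Causal mask: row i is query position kvlen + i and may attend to keys
        # 0..kvlen+i; all later positions get -inf so softmax zeroes them out.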
def torch_attn(hidden_states_i: torch.Tensor,
kv_cache: KDeepSeekV3Cache,
position_ids: torch.Tensor,
page_idx: torch.Tensor,
page_offset: torch.Tensor,
attention_masks: Optional[list[torch.Tensor]] = None,
q_indptr: Optional[torch.Tensor] = None,
kv_indices: Optional[torch.Tensor] = None,
kv_indptr: Optional[torch.Tensor] = None,
bsz_tensors: Optional[torch.Tensor] = None,
last_page_len: Optional[torch.Tensor] = None,
layer_idx: Optional[int] = None,
):
def torch_attn(
hidden_states_i: torch.Tensor,
kv_cache: KDeepSeekV3Cache,
position_ids: torch.Tensor,
page_idx: torch.Tensor,
page_offset: torch.Tensor,
attention_masks: Optional[list[torch.Tensor]] = None,
q_indptr: Optional[torch.Tensor] = None,
kv_indices: Optional[torch.Tensor] = None,
kv_indptr: Optional[torch.Tensor] = None,
bsz_tensors: Optional[torch.Tensor] = None,
last_page_len: Optional[torch.Tensor] = None,
layer_idx: Optional[int] = None,
):
global out_absorb
global q_absorb
hidden_states = hidden_states_i.to(weight_type)
# range bsz_tensors
final_attention_output = torch.tensor([], device=hidden_states.device)
for i in range(bsz_tensors[0]):
batch_num_tokens_tensors = q_indptr[i+1] - q_indptr[i]
batch_num_tokens_tensors = q_indptr[i + 1] - q_indptr[i]
batch_last_page_len = last_page_len[i]
# kv_total_len is kv_len, batch_compressed_kv is compressed_kv, batch_k_pe is k_pe
batch_page_idx = page_idx[q_indptr[i]:q_indptr[i+1]]
batch_page_offset = page_offset[q_indptr[i]:q_indptr[i+1]]
batch_page_idx = page_idx[q_indptr[i] : q_indptr[i + 1]]
batch_page_offset = page_offset[q_indptr[i] : q_indptr[i + 1]]
# kv_page_nums is the number of pages for the current batch
kv_page_nums = kv_indptr[i+1] - kv_indptr[i]
kv_page_nums = kv_indptr[i + 1] - kv_indptr[i]
# kv_total_len is the total length of the kv cache for the current batch (kv_len for algorithm)
kv_total_len = kv_page_nums * page_size
if batch_last_page_len is not None:
kv_total_len = kv_total_len - (page_size - batch_last_page_len)
# print(f"kv_total_len's shape {kv_total_len.shape}")
# kv_index is the index of the kv cache pages for the current batch
kv_index = kv_indices[kv_indptr[i]:kv_indptr[i+1]]
kv_index = kv_indices[kv_indptr[i] : kv_indptr[i + 1]]
# we can index [kv_index, page_offset_indices] to get the kv cache for the current batch
# from q_indptr[i] to q_indptr[i+1] is the range of the current batch
batch_hidden_states = hidden_states[q_indptr[i]:q_indptr[i+1]]
batch_position_ids = position_ids[q_indptr[i]:q_indptr[i+1]]
batch_hidden_states = hidden_states[q_indptr[i] : q_indptr[i + 1]]
batch_position_ids = position_ids[q_indptr[i] : q_indptr[i + 1]]
qlen, _ = batch_hidden_states.size()
# print("qlen -> ", qlen)
hidden_states_to_check = load_fp16_tensor('./debug/query_0_tp_0_input.bin',batch_hidden_states.shape)
hidden_states_to_check = load_fp16_tensor("./debug/query_0_tp_0_input.bin", batch_hidden_states.shape)
diff = torch.abs(batch_hidden_states - hidden_states_to_check).max()
print("hidden_states diff -> ", diff)
@@ -422,8 +422,6 @@ def test_torch():
# print("q_lora mae -> ", mae)
# print("q_lora mae test -> ", mae_test)
q_lora_norm = q_a_layernorm(q_lora)
# q_lora_norm_to_check = load_fp16_tensor('./debug/query_0_tp_0_qlora_norm.bin', q_lora_norm.shape)
# q_lora_norm_to_check_test = load_fp16_tensor('./debug/query_0_tp_0_qlora_norm_test.bin', q_lora_norm.shape)
@@ -435,30 +433,25 @@ def test_torch():
# print("q_lora_norm mae -> ", mae)
# print("q_lora_norm diff test -> ", diff_test)
# print("q_lora_norm mae test -> ", mae_test)
q = q_b_proj(q_lora_norm)
# for v3, bsz, qlen, num_heads(128), qk_head_dim(192=128(nope)+64(rope))
q = q.view(qlen, num_heads, nope_size+rope_size)
q = q.view(qlen, num_heads, nope_size + rope_size)
# q_nope is [qlen, num_heads(128), qk_nope_head_dim(128)]
# q_pe is [qlen, num_heads(128), qk_rope_head_dim(64)]
q_nope, q_pe = torch.split(
q, [nope_size, rope_size], dim=-1
)
q_nope, q_pe = torch.split(q, [nope_size, rope_size], dim=-1)
# compressed_kv is [qlen, kv_lora_rank(512) + rope(64)]
compressed_kv = kv_a_proj_with_mqa(batch_hidden_states)
# compressed_kv is [qlen, kv_lora_rank(512)], k_pe is [qlen, rope(64)]
compressed_kv, k_pe = torch.split(
compressed_kv, [kv_lora_rank, rope_size], dim=-1
)
compressed_kv, k_pe = torch.split(compressed_kv, [kv_lora_rank, rope_size], dim=-1)
compressed_kv = compressed_kv.contiguous()
# compressed_kv_page_0 = compressed_kv[0:page_size, :]
# compressed_kv_to_check = load_fp16_tensor('./debug/query_0_tp_0_page_0_kv_lora_rank',
# compressed_kv_page_0.shape)
# diff = torch.abs(compressed_kv_page_0 - compressed_kv_to_check).max()
# mae = torch.mean(torch.abs(compressed_kv_page_0 - compressed_kv_to_check))
# mae = torch.mean(torch.abs(compressed_kv_page_0 - compressed_kv_to_check))
# print("compressed_kv diff -> ", diff)
# print("compressed_kv mae -> ", mae)
@@ -472,14 +465,11 @@ def test_torch():
# mae = torch.mean(torch.abs(compressed_kv_page_0 - compressed_kv_to_check))
# print("compressed_kv diff norm -> ", diff)
# print("compressed_kv mae norm -> ", mae)
k_pe = k_pe.view(qlen, 1, rope_size)
# compressed_kv is [qlen, 1, kv_lora_rank(512)]
compressed_kv = compressed_kv.view(qlen, 1, kv_lora_rank)
cos, sin = rotary_emb(q_pe, batch_position_ids)
# q_nope_check = q_nope.transpose(0, 1) # qlen is 1, no GPU overhead, same below
@@ -494,8 +484,8 @@ def test_torch():
# print("q_nope[0] mae -> ", mae)
# print("q_nope[0] diff test -> ", diff_test)
# print("q_nope[0] mae test -> ", mae_test)
q_pe_nope = q_pe.transpose(0,1)
q_pe_nope = q_pe.transpose(0, 1)
# q_pe_0_to_check = load_fp16_tensor('./debug/query_0_tp_0_q_rope', q_pe_nope[0].shape)
# q_pe_0_to_check = load_fp16_tensor('./debug/query_0_tp_0_q_rope_no_rope', q_pe_nope[0].shape)
# q_pe_0_to_check_test = load_fp16_tensor('./debug/query_0_tp_0_q_rope_no_rope_test', q_pe_nope[0].shape)
@@ -534,12 +524,11 @@ def test_torch():
q_pe, k_pe = apply_rotary_pos_emb(q_pe.unsqueeze(0), k_pe.unsqueeze(0), cos, sin, unsqueeze_dim=1)
q_pe = q_pe.squeeze(0)
# q_pe is [num_heads(128), qlen, qk_rope_head_dim(64)]
q_pe.transpose_(0, 1)
q_pe.transpose_(0, 1)
# diff = torch.abs(q_pe - q_new).max()
# print("q_pe diff -> ", diff)
# q_pe_0_to_check = load_fp16_tensor('./debug/query_0_tp_0_q_rope', q_pe[0].shape)
# diff = torch.abs(q_pe[0] - q_pe_0_to_check).max()
# mae = torch.mean(torch.abs(q_pe[0] - q_pe_0_to_check))
@@ -552,15 +541,22 @@ def test_torch():
# print("q_pe[0] 2 mae -> ", mae)
if kv_cache is not None:
cache_kwargs = {"sin": sin, "cos": cos, "page_idx": batch_page_idx, "page_offset": batch_page_offset} # Specific to RoPE models
compressed_kv_with_k_pe = kv_cache.update(compressed_kv.unsqueeze(0), k_pe, layer_idx, batch_page_idx, batch_page_offset, cache_kwargs)
compressed_kv = compressed_kv_with_k_pe [:, :, :, :kv_lora_rank].view(-1, page_size, kv_lora_rank)
k_pe = compressed_kv_with_k_pe [:, :, :, kv_lora_rank:].view(-1, page_size, rope_size)
cache_kwargs = {
"sin": sin,
"cos": cos,
"page_idx": batch_page_idx,
"page_offset": batch_page_offset,
} # Specific to RoPE models
compressed_kv_with_k_pe = kv_cache.update(
compressed_kv.unsqueeze(0), k_pe, layer_idx, batch_page_idx, batch_page_offset, cache_kwargs
)
compressed_kv = compressed_kv_with_k_pe[:, :, :, :kv_lora_rank].view(-1, page_size, kv_lora_rank)
k_pe = compressed_kv_with_k_pe[:, :, :, kv_lora_rank:].view(-1, page_size, rope_size)
# q_absorb is [num_heads(128), qk_nope_head_dim(128), kv_lora_rank(512)]
# out_absorb is [num_heads(128), kv_lora_rank(512), v_head_dim(128)] v_head_dim is also the nope dim
# q_absorb, out_absorb = get_absorbed()
# q_nope is [num_heads(128), qlen, qk_nope_head_dim(128)]
q_nope = q_nope.transpose(0, 1) # qlen is 1, no GPU overhead, same below
q_nope = q_nope.transpose(0, 1) # qlen is 1, no GPU overhead, same below
# q_nope_0_to_check = load_fp16_tensor('./debug/query_0_tp_0_q_nope', q_nope[0].shape)
# diff = torch.abs(q_nope[0] - q_nope_0_to_check).max()
@@ -568,7 +564,7 @@ def test_torch():
# print("q_nope[0] diff -> ", diff)
# q_nope is [num_heads(128), qlen, kv_lora_rank(512)]
q_nope = torch.matmul(q_nope, q_absorb) # batched MM
q_nope = torch.matmul(q_nope, q_absorb) # batched MM
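            # Weight absorption: folding the nope half of kv_b_proj into the
            # query maps q_nope from head_dim into the kv_lora_rank space, so
            # attention scores can be computed directly against the cached
            # compressed_kv without materializing per-head keys.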
# k_b_proj_check = load_fp16_tensor('./debug/query_0_tp_0_k_b_lora', (nope_size,kv_lora_rank))
# diff = torch.abs(q_absorb[0] - k_b_proj_check).max()
@@ -594,7 +590,7 @@ def test_torch():
if batch_compressed_kv is None or batch_k_pe is None:
batch_compressed_kv = tmp_compressed_kv
batch_k_pe = tmp_k_pe
else:
else:
batch_compressed_kv = torch.cat((batch_compressed_kv, tmp_compressed_kv), dim=0)
batch_k_pe = torch.cat((batch_k_pe, tmp_k_pe), dim=0)
kv_total_len -= page_size
@@ -604,28 +600,27 @@ def test_torch():
if batch_compressed_kv is None or batch_k_pe is None:
batch_compressed_kv = tmp_compressed_kv
batch_k_pe = tmp_k_pe
else:
else:
batch_compressed_kv = torch.cat((batch_compressed_kv, tmp_compressed_kv), dim=0)
batch_k_pe = torch.cat((batch_k_pe, tmp_k_pe), dim=0)
break
# batch_compressed_kv is [kv_total_len(k_len), kv_lora_rank(512)]
# batch_k_pe is [kv_total_len(k_len), qk_rope_head_dim(64)]
# k_pe_to_check = load_fp16_tensor('./debug/query_0_tp_0_page_0_k_rope', (256,64))
# diff = torch.abs(batch_k_pe[:256] - k_pe_to_check).max()
# mae = torch.mean(torch.abs(batch_k_pe[:256] - k_pe_to_check))
# print("k_pe diff -> ", diff)
# print("k_pe mae -> ", mae)
pe_weights = torch.matmul(q_pe,batch_k_pe.mT)
pe_weights = torch.matmul(q_pe, batch_k_pe.mT)
kv_total_len = kv_page_nums * page_size
# pe_weights_0 = load_fp16_tensor('./debug/query_0_tp_0_pe_attention_weights', (1024,4096))
# pe_weights_0 = pe_weights_0[0:qlen, 0:kv_total_len]
# diff = torch.abs(pe_weights[0] - pe_weights_0).max()
# print("pe_weights[0] diff -> ", diff)
attention_weights = (pe_weights + torch.matmul(q_nope, batch_compressed_kv.mT))
attention_weights = pe_weights + torch.matmul(q_nope, batch_compressed_kv.mT)
# raw_weights = load_fp16_tensor('./debug/query_0_tp_0_raw_attention_weights', (1024, 4096))
# raw_weights = raw_weights[0:qlen, 0:kv_total_len]
@@ -634,47 +629,47 @@ def test_torch():
attention_weights = attention_weights * softmax_scale
# attention_weights is [num_heads(128), qlen, k_len]
# attention_weights = attention_weights.transpose(0,1).unsqueeze(0).squeeze(-1).expand(qlen,-1,-1).transpose(0,1)
# attention_masks[i] is [qlen, k_len]
print(attention_weights.shape)
print(attention_masks.shape)
attention_weights = (attention_weights + attention_masks[ :attention_weights.shape[1],:attention_weights.shape[2]])
attention_weights = (
attention_weights + attention_masks[: attention_weights.shape[1], : attention_weights.shape[2]]
)
# attention_weights shape is [num_heads(128), qlen, k_len]
attention_weights = nn.functional.softmax(attention_weights,dim=-1,dtype=weight_type).to(q_pe.dtype)
attention_weights = nn.functional.softmax(attention_weights, dim=-1, dtype=weight_type).to(q_pe.dtype)
# attention_weights_0 = load_fp16_tensor('./debug/query_0_tp_0_attention_weights', (1024, 4096))
# attention_weights_0 = attention_weights_0[0:qlen, 0:kv_total_len]
# diff = torch.abs(attention_weights[0] - attention_weights_0).max()
# print("attention_weights[0] diff -> ", diff)
attn_output = torch.matmul(attention_weights, batch_compressed_kv) # [num_heads(128),qlen, lora_rank(512)]
attn_output = torch.matmul(attention_weights, batch_compressed_kv) # [num_heads(128),qlen, lora_rank(512)]
# out_absorb shape is [num_heads(128), kv_lora_rank(512), v_head_dim(128)]
# o_absorb_check = load_fp16_tensor('./debug/query_0_tp_0_o_absorb', (qlen,kv_lora_rank))
# diff = torch.abs(attn_output[0] - o_absorb_check).max()
# print("o absorb[0] diff -> ", diff)
out_absorb = out_absorb.transpose(1, 2) # [qlen, num_heads(128), v_head_dim(128)]
out_absorb = out_absorb.transpose(1, 2) # [qlen, num_heads(128), v_head_dim(128)]
# q for qlen, n for num_heads, h for v_head_dim, v for kv_lora_rank
attn_output = torch.matmul(attn_output, out_absorb) # [num_heads(128), qlen, v_head_dim(128)]
attn_output = torch.matmul(attn_output, out_absorb) # [num_heads(128), qlen, v_head_dim(128)]
# attn_output_check_0 = load_fp16_tensor('./debug/query_0_tp_0_attention_output', (qlen, nope_size))
# diff = torch.abs(attn_output[0] - attn_output_check_0).max()
# print("attn_output[0] diff -> ", diff)
attn_output = attn_output.transpose(0, 1) # [qlen, num_heads(128), v_head_dim(128)]
attn_output = attn_output.transpose(0, 1) # [qlen, num_heads(128), v_head_dim(128)]
attn_output = attn_output.reshape(qlen, num_heads * nope_size)
w_o = o_proj.weight.view([hidden_size,num_heads * nope_size])
output = torch.matmul(attn_output,w_o.transpose(0,1))
w_o = o_proj.weight.view([hidden_size, num_heads * nope_size])
output = torch.matmul(attn_output, w_o.transpose(0, 1))
output = output.view(qlen, hidden_size)
# output_0_check = load_fp16_tensor('./debug/query_0_tp_0_qlen_output', (qlen, hidden_size))
# h1_o = w_o[:,:128]
# local_o_check = load_fp16_tensor('./debug/query_0_tp_0_local_w_o', (hidden_size, 128))
@@ -685,35 +680,32 @@ def test_torch():
# diff = torch.abs(h1_output - output_0_check).max()
# print("h1_output diff -> ", diff)
# output_check = load_fp16_tensor('./debug/output.bin', output.shape)
# diff = torch.abs(output - output_check).max()
# mae = torch.mean(torch.abs(output - output_check))
# print("output diff -> ", diff)
final_attention_output = torch.cat((final_attention_output, output), dim=0)
return final_attention_output
torch_output = torch_attn(
hidden_states,
kv_cache,
position_ids,
page_idx,
page_offset,
attention_masks=attention_masks,
q_indptr=q_indptr,
kv_indices=kv_indices,
kv_indptr=kv_indptr,
bsz_tensors=bsz_tensors,
last_page_len=last_page_len,
layer_idx=0
)
print("Torch Output: ",torch_output)
hidden_states,
kv_cache,
position_ids,
page_idx,
page_offset,
attention_masks=attention_masks,
q_indptr=q_indptr,
kv_indices=kv_indices,
kv_indptr=kv_indptr,
bsz_tensors=bsz_tensors,
last_page_len=last_page_len,
layer_idx=0,
)
print("Torch Output: ", torch_output)
return torch_output
torch.set_printoptions(sci_mode=False, precision=5)
output_cpu = test_cpu_mla()
output_torch = test_torch()
@@ -724,11 +716,9 @@ diff = (output_cpu - output_torch).abs()
diff_relative = diff / (output_cpu.abs())
# replace NaNs in diff_relative with 0
diff_relative = torch.where(torch.isnan(diff_relative), torch.zeros_like(diff_relative), diff_relative)
diff_relative_mean = torch.mean(torch.abs(output_cpu-output_torch)) / torch.mean(torch.abs(output_torch))
diff_relative_mean = torch.mean(torch.abs(output_cpu - output_torch)) / torch.mean(torch.abs(output_torch))
print(f'Diff: ave:{diff.mean()}, max:{diff.max()}, min:{diff.min()}, relative_mean:{diff_relative_mean}, relative_max:{diff_relative.max()}, relative_min:{diff_relative.min()}')
print(
f"Diff: ave:{diff.mean()}, max:{diff.max()}, min:{diff.min()}, relative_mean:{diff_relative_mean}, relative_max:{diff_relative.max()}, relative_min:{diff_relative.min()}"
)
assert diff_relative_mean < 2e-1, "CPU and Torch outputs are not close enough!"
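# worked toy example (editorial) of the relative-mean metric asserted above:
# a = [1.0, 2.0, 3.0], b = [1.1, 1.9, 3.0]
# mean|a - b| = 0.2 / 3 ~= 0.0667, mean|b| = 2.0, so relative_mean ~= 0.0333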

View File

@@ -1,19 +1,22 @@
import logging
import os,sys
import os, sys
import time
from typing import Optional
os.environ["BLAS_NUM_THREADS"] = "1"
sys.path.insert(0, os.path.dirname(__file__) + '/../build')
import kt_kernel_ext
sys.path.insert(0, os.path.dirname(__file__) + "/../build")
from kt_kernel import kt_kernel_ext
from kt_kernel_ext.kvcache import ggml_type
import torch
from torch import inf, nn
from torch.nn import init
from torch_attention import apply_rotary_pos_emb,DeepseekV2RMSNorm,KDeepSeekV3Cache,DeepseekV3YarnRotaryEmbedding
from torch_attention import apply_rotary_pos_emb, DeepseekV2RMSNorm, KDeepSeekV3Cache, DeepseekV3YarnRotaryEmbedding
logger = logging.getLogger("reader")
from gguf.gguf_reader import GGUFReader
def read_gguf_file(gguf_file_path):
"""
Reads and prints key-value pairs and tensor information from a GGUF file in an improved format.
@@ -46,12 +49,15 @@ def read_gguf_file(gguf_file_path):
re.append(tensor)
return re
def get_torch_tensor_from_gguf(gguf_weights, name):
return torch.from_numpy(gguf_weights[name].data).contiguous()
def get_torch_tensor_and_type_from_gguf(gguf_weights, name):
return torch.from_numpy(gguf_weights[name].data).contiguous(), gguf_weights[name].tensor_type.name
def type_to_ggml_type(type):
if type == "F32":
return ggml_type.FP32
@@ -75,7 +81,7 @@ kvlen = 0
page_table = range(20)
bsz_tensors=torch.tensor([1])
bsz_tensors = torch.tensor([1])
page_size = 256
@@ -94,8 +100,7 @@ rope_theta = 10000
max_qlen = 1024
max_kvlen = 4096
max_position_embeddings = 163840
max_position_embeddings = 163840
rope_scaling = {
@@ -105,11 +110,10 @@ rope_scaling = {
"mscale": 1.0,
"mscale_all_dim": 1.0,
"original_max_position_embeddings": 4096,
"type": "yarn"
"type": "yarn",
}
CPUInfer = kt_kernel_ext.CPUInfer(64)
validation_iter = 100
@@ -119,15 +123,16 @@ weight_type = torch.bfloat16
# weight_type = torch.float16
input_type = {torch.float32:torch.float32,
torch.float16:torch.float16,
torch.bfloat16:torch.float32,
}[weight_type]
input_type = {
torch.float32: torch.float32,
torch.float16: torch.float16,
torch.bfloat16: torch.float32,
}[weight_type]
q_a_proj = nn.Linear(hidden_size, q_lora_rank, bias=False, dtype=weight_type)
q_b_proj = nn.Linear(q_lora_rank, num_heads * (nope_size+rope_size) , bias=False, dtype=weight_type)
q_b_proj = nn.Linear(q_lora_rank, num_heads * (nope_size + rope_size), bias=False, dtype=weight_type)
kv_a_proj_with_mqa = nn.Linear(hidden_size, kv_lora_rank + rope_size, bias=False, dtype=weight_type)
kv_b_proj = nn.Linear( num_heads * (nope_size + nope_size),kv_lora_rank, bias=False, dtype=weight_type)
kv_b_proj = nn.Linear(num_heads * (nope_size + nope_size), kv_lora_rank, bias=False, dtype=weight_type)
o_proj = nn.Linear(num_heads * nope_size, hidden_size, bias=False, dtype=weight_type)
q_a_norm = torch.ones(hidden_size, dtype=torch.float32)
kv_a_norm = torch.ones(hidden_size, dtype=torch.float32)
@@ -190,7 +195,7 @@ if use_real_weights := True:
o_proj_weight, type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.attn_output.weight")
o_proj.weight = nn.Parameter(o_proj_weight.view(torch.bfloat16), requires_grad=False)
else:
init.normal_(q_a_proj.weight, mean=0.0, std=0.02)
init.normal_(q_b_proj.weight, mean=0.0, std=0.02)
@@ -203,16 +208,16 @@ q_absorb = x_reshaped[:, 0]
out_absorb = x_reshaped[:, 1]
hidden_states = torch.randn((qlen, hidden_size), dtype=input_type).to('cpu').contiguous()
hidden_states = torch.randn((qlen, hidden_size), dtype=input_type).to("cpu").contiguous()
def build_mla():
os.environ["BLAS_NUM_THREADS"] = "1"
q_a_proj_weight = q_a_proj.weight.to(weight_type).to('cpu').contiguous()
q_b_proj_weight = q_b_proj.weight.to(weight_type).to('cpu').contiguous()
kv_a_proj_with_mqa_weight = kv_a_proj_with_mqa.weight.to('cpu').to(weight_type).contiguous()
kv_b_proj_weight = kv_b_proj.weight.to(weight_type).to('cpu').contiguous()
o_proj_weight = o_proj.weight.to(weight_type).to('cpu').contiguous()
q_a_proj_weight = q_a_proj.weight.to(weight_type).to("cpu").contiguous()
q_b_proj_weight = q_b_proj.weight.to(weight_type).to("cpu").contiguous()
kv_a_proj_with_mqa_weight = kv_a_proj_with_mqa.weight.to("cpu").to(weight_type).contiguous()
kv_b_proj_weight = kv_b_proj.weight.to(weight_type).to("cpu").contiguous()
o_proj_weight = o_proj.weight.to(weight_type).to("cpu").contiguous()
config = kt_kernel_ext.mla.MLAConfig(
hidden_size,
@@ -224,7 +229,7 @@ def build_mla():
)
config.max_qlen = max_qlen
config.max_kvlen = max_kvlen
config.max_position_embeddings = max_position_embeddings
config.max_position_embeddings = max_position_embeddings
config.rope_scaling_factor = rope_scaling["factor"]
config.rope_theta = rope_theta
config.rope_scaling_beta_fast = rope_scaling["beta_fast"]
@@ -244,7 +249,6 @@ def build_mla():
config.kv_a_norm = kv_a_norm.data_ptr()
config.kv_a_norm_type = ggml_type.FP32
if weight_type == torch.float32:
config.q_a_proj_type = ggml_type.FP32
config.q_b_proj_type = ggml_type.FP32
@@ -266,10 +270,8 @@ def build_mla():
else:
raise ValueError(f"Unsupported data type: {weight_type}")
config.pool = CPUInfer.backend_
if weight_type == torch.float32:
mla = kt_kernel_ext.mla.MLA_F32(config)
elif weight_type == torch.float16:
@@ -278,25 +280,20 @@ def build_mla():
mla = kt_kernel_ext.mla.MLA_F32(config)
else:
raise ValueError(f"Unsupported data type: {weight_type}")
mla.load_weights()
mla.set_local_pages(pages_count)
return mla
def load_fp32_tensor(file_path, shape):
with open(file_path, 'rb') as f:
with open(file_path, "rb") as f:
raw_data = f.read()
tensor = torch.frombuffer(raw_data, dtype=torch.float32)
tensor = tensor.view(shape) # reshape to the expected shape
return tensor
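# round-trip sketch (editorial; the /tmp path is hypothetical, not from the diff):
# writing raw float32 bytes and reading them back through load_fp32_tensor
# reproduces the tensor exactly, since no conversion happens on either side.
_t = torch.arange(6, dtype=torch.float32).view(2, 3)
_t.numpy().tofile("/tmp/load_fp32_demo.f32")
assert torch.equal(load_fp32_tensor("/tmp/load_fp32_demo.f32", (2, 3)), _t)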
# page3 = load_fp32_tensor('/home/yzw/xwy/Projects/ktransformers-dev/csrc/ktransformers_ext/examples/debug1/query_0_tp_0_page_3_kv_lora_rank_norm.f32',(page_size,kv_lora_rank))
# page3_2 = load_fp32_tensor('/home/yzw/xwy/Projects/ktransformers-dev/csrc/ktransformers_ext/examples/debug2/query_0_tp_0_page_3_kv_lora_rank_norm.f32',(page_size,kv_lora_rank))
@@ -320,7 +317,6 @@ def load_fp32_tensor(file_path, shape):
# print(f'PE Attention Weights Diff: ave:{diff.mean()}, max:{diff.max()}')
# raw_attn_w_1 = load_fp32_tensor('/home/yzw/xwy/Projects/ktransformers-dev/csrc/ktransformers_ext/examples/debug1/query_0_tp_0_raw_attention_weights.f32',(1,max_kvlen))
# raw_attn_w_2 = load_fp32_tensor('/home/yzw/xwy/Projects/ktransformers-dev/csrc/ktransformers_ext/examples/debug2/query_0_tp_0_raw_attention_weights.f32',(qlen,max_kvlen))
# diff = torch.abs(raw_attn_w_1 - raw_attn_w_2[-1])
@@ -334,22 +330,16 @@ def load_fp32_tensor(file_path, shape):
# print(f'Output Diff: ave:{diff.mean()}, max:{diff.max()}')
mla = build_mla()
output = torch.zeros((qlen, hidden_size), dtype=input_type).to('cpu').contiguous()
mla.forward([qlen],[page_table],[kvlen],hidden_states.data_ptr(),output.data_ptr())
print("CPU MLA Output: ",output[-1])
output = torch.zeros((qlen, hidden_size), dtype=input_type).to("cpu").contiguous()
mla.forward([qlen], [page_table], [kvlen], hidden_states.data_ptr(), output.data_ptr())
print("CPU MLA Output: ", output[-1])
output_2 = torch.zeros((1, hidden_size), dtype=input_type).to('cpu').contiguous()
mla.forward([1],[page_table],[qlen-1],hidden_states[-1].data_ptr(),output_2.data_ptr())
print("CPU MLA Output 2: ",output_2[-1])
output_2 = torch.zeros((1, hidden_size), dtype=input_type).to("cpu").contiguous()
mla.forward([1], [page_table], [qlen - 1], hidden_states[-1].data_ptr(), output_2.data_ptr())
print("CPU MLA Output 2: ", output_2[-1])
diff = torch.abs(output[-1] - output_2[-1])
print(f'Diff: ave:{diff.mean()}, max:{diff.max()}')
print(f"Diff: ave:{diff.mean()}, max:{diff.max()}")
assert diff.max() < 1e-1, "CPU and Torch outputs are not close enough!"
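# note (editorial, not from the diff): the two forward calls above exercise the
# same final position twice, once inside a qlen-token prefill and once as a
# one-token decode at kvlen = qlen - 1 over the pages the prefill populated,
# which is why their last rows must agree within the asserted tolerance.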

View File

@@ -1,59 +1,62 @@
import logging
import os,sys
import os, sys
import time
from typing import Optional
os.environ["BLAS_NUM_THREADS"] = "1"
sys.path.insert(0, os.path.dirname(__file__) + '/../build')
import kt_kernel_ext
sys.path.insert(0, os.path.dirname(__file__) + "/../build")
from kt_kernel import kt_kernel_ext
from kt_kernel_ext.kvcache import ggml_type
import torch
from torch import inf, nn
from torch.nn import init
from torch_attention import apply_rotary_pos_emb,DeepseekV2RMSNorm,KDeepSeekV3Cache,DeepseekV3YarnRotaryEmbedding
from torch_attention import apply_rotary_pos_emb, DeepseekV2RMSNorm, KDeepSeekV3Cache, DeepseekV3YarnRotaryEmbedding
logger = logging.getLogger("reader")
from gguf.gguf_reader import GGUFReader
def load_fp32_tensor_raw(file_path):
# return torch.zeros(shape)
with open(file_path, 'rb') as f:
with open(file_path, "rb") as f:
raw_data = f.read()
tensor = torch.frombuffer(raw_data, dtype=torch.float32)
return tensor
def load_fp16_tensor(file_path, shape=None):
# return load_fp32_tensor(file_path, shape)
return load_fp32_tensor_raw(file_path)
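# note (editorial): the early return above makes the float16 branch below
# unreachable; it reads as a debug toggle between the raw-fp32 and shaped
# loaders, matching the commented-out alternatives.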
# return torch.zeros(shape)
with open(file_path, 'rb') as f:
with open(file_path, "rb") as f:
raw_data = f.read()
tensor = torch.frombuffer(raw_data, dtype=weight_type)
tensor = tensor.view(shape) # reshape to the expected shape
return tensor
def load_fp32_tensor(file_path, shape):
# return torch.zeros(shape)
with open(file_path, 'rb') as f:
with open(file_path, "rb") as f:
raw_data = f.read()
tensor = torch.frombuffer(raw_data, dtype=torch.float32)
tensor = tensor.view(shape) # reshape to the expected shape
return tensor
def test_torch():
torch.set_grad_enabled(False)
hidden_states_to_check_decode = load_fp16_tensor('./debug_decode/query_0_tp_0_input.bin')
hidden_states_to_check_prefill = load_fp16_tensor('./debug_prefill/query_0_tp_0_input.bin')
hidden_states_to_check_decode = load_fp16_tensor("./debug_decode/query_0_tp_0_input.bin")
hidden_states_to_check_prefill = load_fp16_tensor("./debug_prefill/query_0_tp_0_input.bin")
# diff = torch.abs(hidden_states_to_check_prefill - hidden_states_to_check_decode).max()
# print("hidden_states diff -> ", diff)
q_lora_to_check_decode = load_fp16_tensor('./debug_decode/query_0_tp_0_qlora.bin')
q_lora_to_check_test_decode = load_fp16_tensor('./debug_decode/query_0_tp_0_qlora_test.bin')
q_lora_to_check_prefill = load_fp16_tensor('./debug_prefill/query_0_tp_0_qlora.bin')
q_lora_to_check_test_prefill = load_fp16_tensor('./debug_prefill/query_0_tp_0_qlora_test.bin')
q_lora_to_check_decode = load_fp16_tensor("./debug_decode/query_0_tp_0_qlora.bin")
q_lora_to_check_test_decode = load_fp16_tensor("./debug_decode/query_0_tp_0_qlora_test.bin")
q_lora_to_check_prefill = load_fp16_tensor("./debug_prefill/query_0_tp_0_qlora.bin")
q_lora_to_check_test_prefill = load_fp16_tensor("./debug_prefill/query_0_tp_0_qlora_test.bin")
# diff = torch.abs(q_lora_to_check_prefill - q_lora_to_check_decode).max()
# diff_test = torch.abs(q_lora_to_check_prefill - q_lora_to_check_decode).max()
# print("q_lora max diff -> ", diff)
@@ -63,8 +66,6 @@ def test_torch():
# print("q_lora mae -> ", mae)
# print("q_lora mae test -> ", mae_test)
# q_lora_norm = q_a_layernorm(q_lora)
# q_lora_norm_to_check = load_fp16_tensor('./debug/query_0_tp_0_qlora_norm.bin', q_lora_norm.shape)
# q_lora_norm_to_check_test = load_fp16_tensor('./debug/query_0_tp_0_qlora_norm_test.bin', q_lora_norm.shape)
@@ -76,7 +77,7 @@ def test_torch():
# print("q_lora_norm mae -> ", mae)
# print("q_lora_norm diff test -> ", diff_test)
# print("q_lora_norm mae test -> ", mae_test)
# q = q_b_proj(q_lora_norm)
# for v3, bsz, qlen, num_heads(128), qk_head_dim(192=128(nope)+64(rope))
# q = q.view(qlen, num_heads, nope_size+rope_size)
@@ -85,7 +86,7 @@ def test_torch():
# q_nope, q_pe = torch.split(
# q, [nope_size, rope_size], dim=-1
# )
# compressed_kv is [qlen, kv_lora_rank(512) + rope(64)]
# compressed_kv = kv_a_proj_with_mqa(batch_hidden_states)
# compressed_kv is [qlen, kv_lora_rank(512)], k_pe is [qlen, rope(64)]
@@ -94,12 +95,11 @@ def test_torch():
# )
# compressed_kv = compressed_kv.contiguous()
# compressed_kv_page_0 = compressed_kv[0:page_size, :]
compressed_kv_to_check_decode = load_fp16_tensor('./debug_decode/query_0_tp_0_page_0_kv_lora_rank')
compressed_kv_to_check_prefill = load_fp16_tensor('./debug_prefill/query_0_tp_0_page_0_kv_lora_rank')
compressed_kv_to_check_decode = load_fp16_tensor("./debug_decode/query_0_tp_0_page_0_kv_lora_rank")
compressed_kv_to_check_prefill = load_fp16_tensor("./debug_prefill/query_0_tp_0_page_0_kv_lora_rank")
# diff = torch.abs(compressed_kv_to_check_prefill - compressed_kv_to_check_decode).max()
# mae = torch.mean(torch.abs(compressed_kv_to_check_prefill - compressed_kv_to_check_decode))
# mae = torch.mean(torch.abs(compressed_kv_to_check_prefill - compressed_kv_to_check_decode))
# print("compressed_kv diff -> ", diff)
# print("compressed_kv mae -> ", mae)
@@ -107,20 +107,17 @@ def test_torch():
# k_pe is [qlen, 1, qk_rope_head_dim(64)]
# compressed_kv_page_0 = compressed_kv[0:page_size, :]
compressed_kv_to_check_decode = load_fp16_tensor('./debug_decode/query_0_tp_0_page_0_kv_lora_rank_norm')
compressed_kv_to_check_prefill = load_fp16_tensor('./debug_prefill/query_0_tp_0_page_0_kv_lora_rank_norm')
compressed_kv_to_check_decode = load_fp16_tensor("./debug_decode/query_0_tp_0_page_0_kv_lora_rank_norm")
compressed_kv_to_check_prefill = load_fp16_tensor("./debug_prefill/query_0_tp_0_page_0_kv_lora_rank_norm")
# diff = torch.abs(compressed_kv_page_0 - compressed_kv_to_check).max()
# mae = torch.mean(torch.abs(compressed_kv_page_0 - compressed_kv_to_check))
# print("compressed_kv diff norm -> ", diff)
# print("compressed_kv mae norm -> ", mae)
# k_pe = k_pe.view(qlen, 1, rope_size)
# compressed_kv is [qlen, 1, kv_lora_rank(512)]
# compressed_kv = compressed_kv.view(qlen, 1, kv_lora_rank)
# cos, sin = rotary_emb(q_pe, batch_position_ids)
# q_nope_check = q_nope.transpose(0, 1) # qlen is 1, no GPU overhead, same below
@@ -135,11 +132,11 @@ def test_torch():
# print("q_nope[0] mae -> ", mae)
# print("q_nope[0] diff test -> ", diff_test)
# print("q_nope[0] mae test -> ", mae_test)
# q_pe_nope = q_pe.transpose(0,1)
q_pe_0_to_check_decode = load_fp16_tensor('./debug_decode/query_0_tp_0_q_rope')
q_pe_0_to_check_prefill = load_fp16_tensor('./debug_prefill/query_0_tp_0_q_rope')
q_pe_0_to_check_decode = load_fp16_tensor("./debug_decode/query_0_tp_0_q_rope")
q_pe_0_to_check_prefill = load_fp16_tensor("./debug_prefill/query_0_tp_0_q_rope")
# q_pe_0_to_check_decode_test = load_fp16_tensor('./debug_decode/query_0_tp_0_q_rope_test')
# q_pe_0_to_check_prefill_test = load_fp16_tensor('./debug_prefill/query_0_tp_0_q_rope_test')
@@ -180,12 +177,11 @@ def test_torch():
# q_pe, k_pe = apply_rotary_pos_emb(q_pe.unsqueeze(0), k_pe.unsqueeze(0), cos, sin, unsqueeze_dim=1)
# q_pe = q_pe.squeeze(0)
# q_pe is [num_heads(128), qlen, qk_rope_head_dim(64)]
# q_pe.transpose_(0, 1)
# q_pe.transpose_(0, 1)
# diff = torch.abs(q_pe - q_new).max()
# print("q_pe diff -> ", diff)
# q_pe_0_to_check = load_fp16_tensor('./debug/query_0_tp_0_q_rope', q_pe[0].shape)
# diff = torch.abs(q_pe[0] - q_pe_0_to_check).max()
# mae = torch.mean(torch.abs(q_pe[0] - q_pe_0_to_check))
@@ -240,7 +236,7 @@ def test_torch():
# if batch_compressed_kv is None or batch_k_pe is None:
# batch_compressed_kv = tmp_compressed_kv
# batch_k_pe = tmp_k_pe
# else:
# else:
# batch_compressed_kv = torch.cat((batch_compressed_kv, tmp_compressed_kv), dim=0)
# batch_k_pe = torch.cat((batch_k_pe, tmp_k_pe), dim=0)
# kv_total_len -= page_size
@@ -250,16 +246,15 @@ def test_torch():
# if batch_compressed_kv is None or batch_k_pe is None:
# batch_compressed_kv = tmp_compressed_kv
# batch_k_pe = tmp_k_pe
# else:
# else:
# batch_compressed_kv = torch.cat((batch_compressed_kv, tmp_compressed_kv), dim=0)
# batch_k_pe = torch.cat((batch_k_pe, tmp_k_pe), dim=0)
# break
# batch_compressed_kv is [kv_total_len(k_len), kv_lora_rank(512)]
# batch_k_pe is [kv_total_len(k_len), qk_rope_head_dim(64)]
k_pe_to_check_decode = load_fp16_tensor('./debug_decode/query_0_tp_0_page_0_k_rope', (256,64))
k_pe_to_check_prefill = load_fp16_tensor('./debug_prefill/query_0_tp_0_page_0_k_rope', (256,64))
k_pe_to_check_decode = load_fp16_tensor("./debug_decode/query_0_tp_0_page_0_k_rope", (256, 64))
k_pe_to_check_prefill = load_fp16_tensor("./debug_prefill/query_0_tp_0_page_0_k_rope", (256, 64))
# diff = torch.abs(k_pe_to_check_prefill - k_pe_to_check_decode).max()
# mae = torch.mean(k_pe_to_check_prefill - k_pe_to_check_decode)
# print("k_pe diff -> ", diff)
@@ -267,13 +262,13 @@ def test_torch():
# pe_weights = torch.matmul(q_pe,batch_k_pe.mT)
# kv_total_len = kv_page_nums * page_size
pe_weights_0_decode = load_fp16_tensor('./debug_decode/query_0_tp_0_pe_attention_weights', (1024,4096))
pe_weights_0_prefill = load_fp16_tensor('./debug_prefill/query_0_tp_0_pe_attention_weights', (1024,4096))
pe_weights_0_decode = load_fp16_tensor("./debug_decode/query_0_tp_0_pe_attention_weights", (1024, 4096))
pe_weights_0_prefill = load_fp16_tensor("./debug_prefill/query_0_tp_0_pe_attention_weights", (1024, 4096))
# diff = torch.abs(pe_weights[0] - pe_weights_0).max()
# print("pe_weights[0] diff -> ", diff)
# attention_weights = (pe_weights + torch.matmul(q_nope, batch_compressed_kv.mT))
# attention_weights = (pe_weights + torch.matmul(q_nope, batch_compressed_kv.mT))
# raw_weights = load_fp16_tensor('./debug/query_0_tp_0_raw_attention_weights', (1024, 4096))
# raw_weights = raw_weights[0:qlen, 0:kv_total_len]
@@ -282,25 +277,23 @@ def test_torch():
# attention_weights = attention_weights * softmax_scale
# attention_weights is [num_heads(128), qlen, k_len]
# attention_weights = attention_weights.transpose(0,1).unsqueeze(0).squeeze(-1).expand(qlen,-1,-1).transpose(0,1)
# attention_masks[i] is [qlen, k_len]
# attention_weights = (attention_weights + attention_masks)
# attention_weights shape is [num_heads(128), qlen, k_len]
# attention_weights = nn.functional.softmax(attention_weights,dim=-1,dtype=weight_type).to(q_pe.dtype)
attention_weights_0_decode = load_fp16_tensor('./debug_decode/query_0_tp_0_attention_weights', (1024, 4096))
attention_weights_0_prefill = load_fp16_tensor('./debug_prefill/query_0_tp_0_attention_weights', (1024, 4096))
attention_weights_0_decode = load_fp16_tensor("./debug_decode/query_0_tp_0_attention_weights", (1024, 4096))
attention_weights_0_prefill = load_fp16_tensor("./debug_prefill/query_0_tp_0_attention_weights", (1024, 4096))
# attention_weights_0 = attention_weights_0[0:qlen, 0:kv_total_len]
# diff = torch.abs(attention_weights[0] - attention_weights_0).max()
# print("attention_weights[0] diff -> ", diff)
# attn_output = torch.matmul(attention_weights, batch_compressed_kv) # [num_heads(128),qlen, lora_rank(512)]
# out_absorb shape is [num_heads(128), kv_lora_rank(512), v_head_dim(128)]
@@ -322,7 +315,7 @@ def test_torch():
# w_o = o_proj.weight.view([hidden_size,num_heads * nope_size])
# output = torch.matmul(attn_output,w_o.transpose(0,1))
# output = output.view(qlen, hidden_size)
# output_0_check = load_fp16_tensor('./debug/query_0_tp_0_qlen_output', (qlen, hidden_size))
# h1_o = w_o[:,:128]
# local_o_check = load_fp16_tensor('./debug/query_0_tp_0_local_w_o', (hidden_size, 128))
@@ -333,18 +326,15 @@ def test_torch():
# diff = torch.abs(h1_output - output_0_check).max()
# print("h1_output diff -> ", diff)
output_check_decode = load_fp16_tensor('./debug_decode/output.bin')
output_check_prefill = load_fp16_tensor('./debug_prefill/output.bin')
output_check_decode = load_fp16_tensor("./debug_decode/output.bin")
output_check_prefill = load_fp16_tensor("./debug_prefill/output.bin")
# diff = torch.abs(output - output_check).max()
# mae = torch.mean(torch.abs(output - output_check))
# print("output diff -> ", diff)
return None
torch.set_printoptions(sci_mode=False, precision=5)
# output_cpu = test_cpu_mla()
# output_cpu_quant = test_cpu_mla_quant()
@@ -361,7 +351,3 @@ output_torch = test_torch()
# print(f'Diff: ave:{diff.mean()}, max:{diff.max()}, min:{diff.min()}, relative_mean:{diff_relative_mean}, relative_max:{diff_relative.max()}, relative_min:{diff_relative.min()}')
# assert diff_relative_mean < 2e-1, "CPU and Torch outputs are not close enough!"

View File

@@ -1,13 +1,14 @@
import os,sys
import os, sys
import time
from typing import Optional
sys.path.insert(0, os.path.dirname(__file__) + '/../build')
import kt_kernel_ext
sys.path.insert(0, os.path.dirname(__file__) + "/../build")
from kt_kernel import kt_kernel_ext
from kt_kernel_ext.kvcache import ggml_type
import torch
from torch import nn
from torch.nn import init
from torch_attention import apply_rotary_pos_emb,DeepseekV2RMSNorm,KDeepSeekV3Cache,DeepseekV3YarnRotaryEmbedding
from torch_attention import apply_rotary_pos_emb, DeepseekV2RMSNorm, KDeepSeekV3Cache, DeepseekV3YarnRotaryEmbedding
seed = 42 # any integer works as the seed
@@ -19,7 +20,7 @@ kvlen = 0
page_table = range(20)
bsz_tensors=torch.tensor([1])
bsz_tensors = torch.tensor([1])
page_size = 256
@@ -38,8 +39,7 @@ rope_theta = 10000
max_qlen = 1024
max_kvlen = 4096
max_position_embeddings = 163840
max_position_embeddings = 163840
rope_scaling = {
@@ -49,17 +49,16 @@ rope_scaling = {
"mscale": 1.0,
"mscale_all_dim": 1.0,
"original_max_position_embeddings": 4096,
"type": "yarn"
"type": "yarn",
}
CPUInfer = kt_kernel_ext.CPUInfer(64)
validation_iter = 100
q_a_proj = nn.Linear(hidden_size, q_lora_rank, bias=False, dtype=torch.float16)
q_b_proj = nn.Linear(q_lora_rank, num_heads * (nope_size+rope_size) , bias=False, dtype=torch.float16)
q_b_proj = nn.Linear(q_lora_rank, num_heads * (nope_size + rope_size), bias=False, dtype=torch.float16)
kv_a_proj_with_mqa = nn.Linear(hidden_size, kv_lora_rank + rope_size, bias=False, dtype=torch.float16)
kv_b_proj = nn.Linear(kv_lora_rank, num_heads * (nope_size + nope_size), bias=False, dtype=torch.float16)
o_proj = nn.Linear(num_heads * nope_size, hidden_size, bias=False, dtype=torch.float16)
@@ -70,13 +69,11 @@ init.normal_(kv_a_proj_with_mqa.weight, mean=0.0, std=0.02)
init.normal_(kv_b_proj.weight, mean=0.0, std=0.02)
init.normal_(o_proj.weight, mean=0.0, std=0.02)
q_a_proj_weight = q_a_proj.weight.to(torch.float16).to('cpu').contiguous()
q_b_proj_weight = q_b_proj.weight.to(torch.float16).to('cpu').contiguous()
kv_a_proj_with_mqa_weight = kv_a_proj_with_mqa.weight.to('cpu').to(torch.float16).contiguous()
kv_b_proj_weight = kv_b_proj.weight.to(torch.float16).to('cpu').contiguous()
o_proj_weight = o_proj.weight.to(torch.float16).to('cpu').contiguous()
q_a_proj_weight = q_a_proj.weight.to(torch.float16).to("cpu").contiguous()
q_b_proj_weight = q_b_proj.weight.to(torch.float16).to("cpu").contiguous()
kv_a_proj_with_mqa_weight = kv_a_proj_with_mqa.weight.to("cpu").to(torch.float16).contiguous()
kv_b_proj_weight = kv_b_proj.weight.to(torch.float16).to("cpu").contiguous()
o_proj_weight = o_proj.weight.to(torch.float16).to("cpu").contiguous()
config = kt_kernel_ext.mla.MLAConfig(
@@ -89,7 +86,7 @@ config = kt_kernel_ext.mla.MLAConfig(
)
config.max_qlen = max_qlen
config.max_kvlen = max_kvlen
config.max_position_embeddings = max_position_embeddings
config.max_position_embeddings = max_position_embeddings
config.rope_scaling_factor = rope_scaling["factor"]
config.rope_theta = rope_theta
config.rope_scaling_beta_fast = rope_scaling["beta_fast"]
@@ -114,30 +111,27 @@ config.w_o_type = ggml_type.FP16
config.pool = CPUInfer.backend_
mla = kt_kernel_ext.mla.MLA(config)
mla.load_weights()
mla.set_local_pages(pages_count)
input = torch.randn((qlen, hidden_size), dtype=torch.float16).to('cpu').contiguous()
input = torch.randn((qlen, hidden_size), dtype=torch.float16).to("cpu").contiguous()
output = torch.zeros((qlen, hidden_size), dtype=torch.float16).to('cpu').contiguous()
mla.forward([qlen],[page_table],[kvlen],input.data_ptr(),output.data_ptr())
print("CPU MLA Output: ",output)
output = torch.zeros((qlen, hidden_size), dtype=torch.float16).to("cpu").contiguous()
mla.forward([qlen], [page_table], [kvlen], input.data_ptr(), output.data_ptr())
print("CPU MLA Output: ", output)
softmax_scale = (nope_size + rope_size) ** -0.5
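# worked value (editorial, using the head dims noted in the comments below):
# nope 128 + rope 64 = 192, so softmax_scale = 192 ** -0.5 ~= 0.0722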
# the leading 1 is the number of compressed kv heads
k_caches = torch.randn(1,pages_count, page_size, 1, kv_lora_rank + rope_size).to(torch.float16)
k_caches = torch.randn(1, pages_count, page_size, 1, kv_lora_rank + rope_size).to(torch.float16)
kv_cache = KDeepSeekV3Cache(page_size=page_size, kv_lora_rank=kv_lora_rank, k_caches=k_caches)
q_a_layernorm = DeepseekV2RMSNorm(q_lora_rank)
x = torch.randn(q_lora_rank, dtype=torch.float16)*100
x = torch.randn(q_lora_rank, dtype=torch.float16) * 100
print(x)
print(q_a_layernorm(x))
@@ -163,110 +157,114 @@ rotary_emb = DeepseekV3YarnRotaryEmbedding(
# last_page_len = [qlen+kvlen,...] layer_idx = 1
# position_ids = [kvlen:qlen+kvlen]
hidden_states = torch.randn(qlen, hidden_size, dtype=torch.float16)
q_indptr = torch.tensor([0,qlen]).to(torch.int32)
q_indptr = torch.tensor([0, qlen]).to(torch.int32)
kv_indptr = torch.tensor([0,(qlen+kvlen+page_size-1)//page_size]).to(torch.int32)
kv_indptr = torch.tensor([0, (qlen + kvlen + page_size - 1) // page_size]).to(torch.int32)
kv_indices = torch.tensor(range(pages_count)).to(torch.int32)
page_idx = torch.tensor([i//page_size for i in range(kvlen,kvlen+qlen)] ).to(torch.int32)
page_offset = torch.tensor( [i%page_size for i in range(kvlen, kvlen + qlen)]).to(torch.int32)
page_idx = torch.tensor([i // page_size for i in range(kvlen, kvlen + qlen)]).to(torch.int32)
page_offset = torch.tensor([i % page_size for i in range(kvlen, kvlen + qlen)]).to(torch.int32)
last_page_len = torch.tensor([(qlen+kvlen)%page_size], device=hidden_states.device)
last_page_len = torch.tensor([(qlen + kvlen) % page_size], device=hidden_states.device)
position_ids = torch.tensor(range(kvlen, kvlen + qlen)).to(torch.int32)
# build the mask row by row: [qlen, kvlen+qlen]
attention_masks = torch.zeros((qlen, kvlen + qlen), dtype=torch.float16)
for i in range(qlen):
attention_masks[i, i + kvlen + 1: i + kvlen + qlen] = -65504.0
attention_masks[i, i + kvlen + 1 : i + kvlen + qlen] = -65504.0
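# sanity check (editorial, toy sizes) of the paged indexing built above: with
# kvlen=0, qlen=5, page_size=4 the token-to-page mapping and page count are
# page_idx = [0, 0, 0, 0, 1], page_offset = [0, 1, 2, 3, 0], 2 pages in total;
# -65504.0 is the most negative finite float16, so the mask acts as -inf
# under the later softmax.
_q, _kv, _ps = 5, 0, 4
assert [i // _ps for i in range(_kv, _kv + _q)] == [0, 0, 0, 0, 1]
assert [i % _ps for i in range(_kv, _kv + _q)] == [0, 1, 2, 3, 0]
assert (_q + _kv + _ps - 1) // _ps == 2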
def torch_attn(hidden_states: torch.Tensor,
kv_cache: KDeepSeekV3Cache,
position_ids: torch.Tensor,
page_idx: torch.Tensor,
page_offset: torch.Tensor,
attention_masks: Optional[list[torch.Tensor]] = None,
q_indptr: Optional[torch.Tensor] = None,
kv_indices: Optional[torch.Tensor] = None,
kv_indptr: Optional[torch.Tensor] = None,
bsz_tensors: Optional[torch.Tensor] = None,
last_page_len: Optional[torch.Tensor] = None,
layer_idx: Optional[int] = None,
):
def torch_attn(
hidden_states: torch.Tensor,
kv_cache: KDeepSeekV3Cache,
position_ids: torch.Tensor,
page_idx: torch.Tensor,
page_offset: torch.Tensor,
attention_masks: Optional[list[torch.Tensor]] = None,
q_indptr: Optional[torch.Tensor] = None,
kv_indices: Optional[torch.Tensor] = None,
kv_indptr: Optional[torch.Tensor] = None,
bsz_tensors: Optional[torch.Tensor] = None,
last_page_len: Optional[torch.Tensor] = None,
layer_idx: Optional[int] = None,
):
global out_absorb
global q_absorb
# range bsz_tensors
final_attention_output = torch.tensor([], device=hidden_states.device)
for i in range(bsz_tensors[0]):
batch_num_tokens_tensors = q_indptr[i+1] - q_indptr[i]
batch_num_tokens_tensors = q_indptr[i + 1] - q_indptr[i]
batch_last_page_len = last_page_len[i]
# kv_total_len is kv_len, batch_compressed_kv is compressed_kv, batch_k_pe is k_pe
batch_page_idx = page_idx[q_indptr[i]:q_indptr[i+1]]
batch_page_offset = page_offset[q_indptr[i]:q_indptr[i+1]]
batch_page_idx = page_idx[q_indptr[i] : q_indptr[i + 1]]
batch_page_offset = page_offset[q_indptr[i] : q_indptr[i + 1]]
# kv_page_nums is the number of pages for the current batch
kv_page_nums = kv_indptr[i+1] - kv_indptr[i]
kv_page_nums = kv_indptr[i + 1] - kv_indptr[i]
# kv_total_len is the total length of the kv cache for the current batch (kv_len for algorithm)
kv_total_len = kv_page_nums * page_size
if batch_last_page_len is not None:
kv_total_len = kv_total_len - (page_size - batch_last_page_len)
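# e.g. (editorial): 2 pages of 256 with batch_last_page_len = 5 gives
# kv_total_len = 2 * 256 - (256 - 5) = 261 cached tokens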
# print(f"kv_total_len's shape {kv_total_len.shape}")
# kv_index is the index of the kv cache pages for the current batch
kv_index = kv_indices[kv_indptr[i]:kv_indptr[i+1]]
kv_index = kv_indices[kv_indptr[i] : kv_indptr[i + 1]]
# we can index [kv_index, page_offset_indices] to get the kv cache for the current batch
# from q_indptr[i] to q_indptr[i+1] is the range of the current batch
batch_hidden_states = hidden_states[q_indptr[i]:q_indptr[i+1]]
batch_position_ids = position_ids[q_indptr[i]:q_indptr[i+1]]
batch_hidden_states = hidden_states[q_indptr[i] : q_indptr[i + 1]]
batch_position_ids = position_ids[q_indptr[i] : q_indptr[i + 1]]
qlen, _ = batch_hidden_states.size()
# print("qlen -> ", qlen)
q_lora = q_a_proj(batch_hidden_states)
print('q_a_proj',q_a_proj.weight)
print('q_lora',q_lora)
print("q_a_proj", q_a_proj.weight)
print("q_lora", q_lora)
q = q_b_proj(q_a_layernorm(q_lora))
print('q_b_proj',q_b_proj.weight)
print("q_b_proj", q_b_proj.weight)
# for v3, bsz, qlen, num_heads(128), qk_head_dim(192=128(nope)+64(rope))
q = q.view(qlen, num_heads, nope_size+rope_size)
q = q.view(qlen, num_heads, nope_size + rope_size)
# q_nope is [qlen, num_heads(128), qk_nope_head_dim(128)]
# q_pe is [qlen, num_heads(128), qk_rope_head_dim(64)]
q_nope, q_pe = torch.split(
q, [nope_size, rope_size], dim=-1
)
print('q_nope',q_nope)
print('q_pe',q_pe)
q_nope, q_pe = torch.split(q, [nope_size, rope_size], dim=-1)
print("q_nope", q_nope)
print("q_pe", q_pe)
# compressed_kv is [qlen, kv_lora_rank(512) + rope(64)]
compressed_kv = kv_a_proj_with_mqa(batch_hidden_states)
# compressed_kv is [qlen, kv_lora_rank(512)], k_pe is [qlen, rope(64)]
compressed_kv, k_pe = torch.split(
compressed_kv, [kv_lora_rank, rope_size], dim=-1
)
compressed_kv, k_pe = torch.split(compressed_kv, [kv_lora_rank, rope_size], dim=-1)
compressed_kv = compressed_kv.contiguous()
compressed_kv = kv_a_layernorm(compressed_kv)
# k_pe is [qlen, 1, qk_rope_head_dim(64)]
print('compressed_kv ',compressed_kv)
print('k_pe ',k_pe)
print("compressed_kv ", compressed_kv)
print("k_pe ", k_pe)
k_pe = k_pe.view(qlen, 1, rope_size)
# compressed_kv is [qlen, 1, kv_lora_rank(512)]
compressed_kv = compressed_kv.view(qlen, 1, kv_lora_rank)
cos, sin = rotary_emb(q_pe, batch_position_ids)
# print(f"q_pe shape{q_pe.shape}, k_pe shape {k_pe.shape}")
q_pe, k_pe = apply_rotary_pos_emb(q_pe.unsqueeze(0), k_pe.unsqueeze(0), cos, sin, unsqueeze_dim=1)
q_pe = q_pe.squeeze(0)
# q_pe is [num_heads(128), qlen, qk_rope_head_dim(64)]
q_pe.transpose_(0, 1)
q_pe.transpose_(0, 1)
if kv_cache is not None:
cache_kwargs = {"sin": sin, "cos": cos, "page_idx": batch_page_idx, "page_offset": batch_page_offset} # Specific to RoPE models
compressed_kv_with_k_pe = kv_cache.update(compressed_kv.unsqueeze(0), k_pe, layer_idx, batch_page_idx, batch_page_offset, cache_kwargs)
compressed_kv = compressed_kv_with_k_pe [:, :, :, :kv_lora_rank].view(-1, page_size, kv_lora_rank)
k_pe = compressed_kv_with_k_pe [:, :, :, kv_lora_rank:].view(-1, page_size, rope_size)
cache_kwargs = {
"sin": sin,
"cos": cos,
"page_idx": batch_page_idx,
"page_offset": batch_page_offset,
} # Specific to RoPE models
compressed_kv_with_k_pe = kv_cache.update(
compressed_kv.unsqueeze(0), k_pe, layer_idx, batch_page_idx, batch_page_offset, cache_kwargs
)
compressed_kv = compressed_kv_with_k_pe[:, :, :, :kv_lora_rank].view(-1, page_size, kv_lora_rank)
k_pe = compressed_kv_with_k_pe[:, :, :, kv_lora_rank:].view(-1, page_size, rope_size)
# q_absorb is [num_heads(128), qk_nope_head_dim(128), kv_lora_rank(512)]
# out_absorb is [num_heads(128), kv_lora_rank(512), v_head_dim(128)] v_head_dim is also the nope dim
# q_absorb, out_absorb = get_absorbed()
# q_nope is [num_heads(128), qlen, qk_nope_head_dim(128)]
q_nope = q_nope.transpose(0, 1) # qlen is 1, no GPU overhead, same below
q_nope = q_nope.transpose(0, 1) # qlen is 1, no GPU overhead, same below
# q_nope is [num_heads(128), qlen, kv_lora_rank(512)]
q_nope = torch.matmul(q_nope, q_absorb) # batched MM
q_nope = torch.matmul(q_nope, q_absorb) # batched MM
# # q_nope is [qlen, num_heads(128), kv_lora_rank(512)]
# q_nope = q_nope.transpose(0, 1)
@@ -281,7 +279,7 @@ def torch_attn(hidden_states: torch.Tensor,
if batch_compressed_kv is None or batch_k_pe is None:
batch_compressed_kv = tmp_compressed_kv
batch_k_pe = tmp_k_pe
else:
else:
batch_compressed_kv = torch.cat((batch_compressed_kv, tmp_compressed_kv), dim=0)
batch_k_pe = torch.cat((batch_k_pe, tmp_k_pe), dim=0)
kv_total_len -= page_size
@@ -291,57 +289,48 @@ def torch_attn(hidden_states: torch.Tensor,
if batch_compressed_kv is None or batch_k_pe is None:
batch_compressed_kv = tmp_compressed_kv
batch_k_pe = tmp_k_pe
else:
else:
batch_compressed_kv = torch.cat((batch_compressed_kv, tmp_compressed_kv), dim=0)
batch_k_pe = torch.cat((batch_k_pe, tmp_k_pe), dim=0)
break
# batch_compressed_kv is [kv_total_len(k_len), kv_lora_rank(512)]
# batch_k_pe is [kv_total_len(k_len), qk_rope_head_dim(64)]
pe_weights = torch.matmul(q_pe,batch_k_pe.mT)
print('pe_weights',pe_weights)
pe_weights = torch.matmul(q_pe, batch_k_pe.mT)
print("pe_weights", pe_weights)
attention_weights = (pe_weights + torch.matmul(q_nope, batch_compressed_kv.mT)) * softmax_scale
# attention_weights is [num_heads(128), qlen, k_len]
# attention_weights = attention_weights.transpose(0,1).unsqueeze(0).squeeze(-1).expand(qlen,-1,-1).transpose(0,1)
# attention_masks[i] is [qlen, k_len]
attention_weights = (attention_weights + attention_masks[i])
attention_weights = attention_weights + attention_masks[i]
# attention_weights shape is [num_heads(128), qlen, k_len]
attention_weights = nn.functional.softmax(attention_weights,dim=-1,dtype=torch.float16).to(q_pe.dtype)
attn_output = torch.matmul(attention_weights, batch_compressed_kv) # [num_heads(128),qlen, lora_rank(512)]
attention_weights = nn.functional.softmax(attention_weights, dim=-1, dtype=torch.float16).to(q_pe.dtype)
attn_output = torch.matmul(attention_weights, batch_compressed_kv) # [num_heads(128),qlen, lora_rank(512)]
# out_absorb shape is [num_heads(128), kv_lora_rank(512), v_head_dim(128)]
out_absorb = out_absorb.transpose(1,2)
out_absorb = out_absorb.transpose(1, 2)
# q for qlen, n for num_heads, h for v_head_dim, v for kv_lora_rank
attn_output = torch.matmul(attn_output, out_absorb) # [num_heads(128), qlen, v_head_dim(128)]
attn_output = attn_output.transpose(0, 1) # [qlen, num_heads(128), v_head_dim(128)]
attn_output = torch.matmul(attn_output, out_absorb) # [num_heads(128), qlen, v_head_dim(128)]
attn_output = attn_output.transpose(0, 1) # [qlen, num_heads(128), v_head_dim(128)]
attn_output = attn_output.reshape(qlen, num_heads * nope_size)
attn_output = o_proj(attn_output)
final_attention_output = torch.cat((final_attention_output, attn_output), dim=0)
return final_attention_output
torch_output = torch_attn(
input,
kv_cache,
position_ids,
page_idx,
page_offset,
attention_masks=attention_masks,
q_indptr=q_indptr,
kv_indices=kv_indices,
kv_indptr=kv_indptr,
bsz_tensors=bsz_tensors,
last_page_len=last_page_len,
layer_idx=0
)
print("Torch Output: ",torch_output)
input,
kv_cache,
position_ids,
page_idx,
page_offset,
attention_masks=attention_masks,
q_indptr=q_indptr,
kv_indices=kv_indices,
kv_indptr=kv_indptr,
bsz_tensors=bsz_tensors,
last_page_len=last_page_len,
layer_idx=0,
)
print("Torch Output: ", torch_output)

View File

@@ -1,36 +1,39 @@
#!/usr/bin/env python
# coding=utf-8
'''
Description :
"""
Description :
Author : chenht2022
Date : 2024-07-25 10:32:05
Version : 1.0.0
LastEditors : chenht2022
LastEditors : chenht2022
LastEditTime : 2024-08-06 10:37:28
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
'''
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
"""
import os, sys
import time
sys.path.append(os.path.dirname(__file__) + '/../build')
import kt_kernel_ext
sys.path.append(os.path.dirname(__file__) + "/../build")
from kt_kernel import kt_kernel_ext
import torch
hidden_size = 5120
intermediate_size = 3072
stride = 32
group_max_len = 1024
gate_type = 1 # ggml_type::GGML_TYPE_F16
up_type = 1 # ggml_type::GGML_TYPE_F16
down_type = 1 # ggml_type::GGML_TYPE_F16
hidden_type = 1 # ggml_type::GGML_TYPE_F16
gate_type = 1 # ggml_type::GGML_TYPE_F16
up_type = 1 # ggml_type::GGML_TYPE_F16
down_type = 1 # ggml_type::GGML_TYPE_F16
hidden_type = 1 # ggml_type::GGML_TYPE_F16
qlen = 30
layer_num = 10
CPUInfer = kt_kernel_ext.CPUInfer(48)
validation_iter = 100
def act_fn(x):
return x / (1.0 + torch.exp(-x))
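# equivalence check (editorial, not part of the diff): act_fn is SiLU,
# x * sigmoid(x), so torch's built-in matches up to rounding.
_x = torch.randn(8)
assert torch.allclose(act_fn(_x), torch.nn.functional.silu(_x), atol=1e-6)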
def mlp_torch(input, gate_proj, up_proj, down_proj):
gate_buf = torch.mm(input, gate_proj.t())
up_buf = torch.mm(input, up_proj.t())
@@ -38,16 +41,35 @@ def mlp_torch(input, gate_proj, up_proj, down_proj):
ret = torch.mm(intermediate, down_proj.t())
return ret
with torch.inference_mode(mode=True):
mlps = []
gate_projs = []
up_projs = []
down_projs = []
for _ in range(layer_num):
gate_proj = torch.randn((intermediate_size, hidden_size), dtype=torch.float16, device = "cuda").to("cpu").contiguous()
up_proj = torch.randn((intermediate_size, hidden_size), dtype=torch.float16, device = "cuda").to("cpu").contiguous()
down_proj = torch.randn((hidden_size, intermediate_size), dtype=torch.float16, device = "cuda").to("cpu").contiguous()
config = kt_kernel_ext.mlp.MLPConfig(hidden_size, intermediate_size, stride, group_max_len, gate_proj.data_ptr(), up_proj.data_ptr(), down_proj.data_ptr(), gate_type, up_type, down_type, hidden_type)
gate_proj = (
torch.randn((intermediate_size, hidden_size), dtype=torch.float16, device="cuda").to("cpu").contiguous()
)
up_proj = (
torch.randn((intermediate_size, hidden_size), dtype=torch.float16, device="cuda").to("cpu").contiguous()
)
down_proj = (
torch.randn((hidden_size, intermediate_size), dtype=torch.float16, device="cuda").to("cpu").contiguous()
)
config = kt_kernel_ext.mlp.MLPConfig(
hidden_size,
intermediate_size,
stride,
group_max_len,
gate_proj.data_ptr(),
up_proj.data_ptr(),
down_proj.data_ptr(),
gate_type,
up_type,
down_type,
hidden_type,
)
mlp = kt_kernel_ext.mlp.MLP(config)
gate_projs.append(gate_proj)
up_projs.append(up_proj)
@@ -61,22 +83,16 @@ with torch.inference_mode(mode=True):
output = torch.empty((qlen, hidden_size), dtype=torch.float16).contiguous()
input = input / 100
CPUInfer.submit(
mlp.forward(
qlen,
input.data_ptr(),
output.data_ptr()
)
)
CPUInfer.submit(mlp.forward(qlen, input.data_ptr(), output.data_ptr()))
CPUInfer.sync()
# print('cpuinfer output', output)
gate_proj = gate_projs[i%layer_num]
up_proj = up_projs[i%layer_num]
down_proj = down_projs[i%layer_num]
gate_proj = gate_projs[i % layer_num]
up_proj = up_projs[i % layer_num]
down_proj = down_projs[i % layer_num]
t_output = mlp_torch(input, gate_proj, up_proj, down_proj)
# print('torch output', t_output)
diff = torch.mean(torch.abs(output - t_output)) / torch.mean(torch.abs(t_output))
print('diff = ', diff)
assert(diff < 0.001)
print("diff = ", diff)
assert diff < 0.001

View File

@@ -1,18 +1,19 @@
#!/usr/bin/env python
# coding=utf-8
'''
Description :
"""
Description :
Author : chenht2022
Date : 2024-07-25 10:32:05
Version : 1.0.0
LastEditors : SkqLiao
LastEditors : SkqLiao
LastEditTime : 2025-03-13 11:38:05
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
'''
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
"""
import os, sys
import time
sys.path.insert(0, os.path.dirname(__file__) + '/../build')
import kt_kernel_ext
sys.path.insert(0, os.path.dirname(__file__) + "/../build")
from kt_kernel import kt_kernel_ext
import torch
from tqdm import tqdm
from kt_kernel_ext.kvcache import ggml_type
@@ -20,7 +21,7 @@ from kt_kernel_ext.kvcache import ggml_type
torch.manual_seed(0)
expert_num = 8
hidden_size = 2048 #7168
hidden_size = 2048 # 7168
intermediate_size = 2048
stride = 32
group_min_len = 10
@@ -39,9 +40,11 @@ layer_num = 1
CPUInfer = kt_kernel_ext.CPUInfer(64)
validation_iter = 10
def act_fn(x):
return x / (1.0 + torch.exp(-x))
def mlp_torch(input, gate_proj, up_proj, down_proj):
gate_buf = torch.mm(input, gate_proj.t())
up_buf = torch.mm(input, up_proj.t())
@@ -49,6 +52,7 @@ def mlp_torch(input, gate_proj, up_proj, down_proj):
ret = torch.mm(intermediate, down_proj.t())
return ret
def moe_torch(input, expert_ids, weights, gate_proj, up_proj, down_proj):
cnts = expert_ids.new_zeros((expert_ids.shape[0], expert_num))
cnts.scatter_(1, expert_ids, 1)
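# scatter_ aside (editorial, toy sizes): each row one-hot-accumulates its
# expert ids, yielding per-token counts over expert_num columns.
_ids = torch.tensor([[0, 2], [2, 1]])
_cnts = _ids.new_zeros((2, 3))
_cnts.scatter_(1, _ids, 1)
assert _cnts.tolist() == [[1, 0, 1], [0, 1, 1]]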
@@ -85,10 +89,12 @@ def to_cpuinfer_tensor(tensor, type):
size = torch.prod(torch.tensor(tensor.shape, dtype=torch.int32)).item()
return kt_kernel_ext.utils.from_float(tensor.data_ptr(), size, type)
def from_cpuinfer_tensor(tensor, size, type):
return kt_kernel_ext.utils.to_float(tensor.data_ptr(), size, type)
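# hedged usage sketch (editorial; API per the two helpers above, Q8_0 chosen
# arbitrarily, round-trip error depends on the quantization type):
# _t = torch.randn(256, dtype=torch.float32).contiguous()
# _qt = to_cpuinfer_tensor(_t, ggml_type.Q8_0)
# _t2 = from_cpuinfer_tensor(_qt, 256, ggml_type.Q8_0)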
qlens = [1,64] #[64, 512, 2048, 8192, 16384]
qlens = [1, 64] # [64, 512, 2048, 8192, 16384]
# gate_types = [ggml_type.FP32, ggml_type.FP16, ggml_type.Q8_0, ggml_type.Q6_K, ggml_type.Q5_K, ggml_type.Q4_K, ggml_type.Q3_K]
# up_types = [ggml_type.FP32, ggml_type.FP16, ggml_type.Q8_0, ggml_type.Q6_K, ggml_type.Q5_K, ggml_type.Q4_K, ggml_type.Q3_K]
# down_types = [ggml_type.FP32, ggml_type.FP16, ggml_type.Q8_0, ggml_type.Q6_K, ggml_type.Q6_K, ggml_type.Q6_K, ggml_type.Q5_K]
@@ -96,8 +102,8 @@ gate_types = [ggml_type.Q4_K]
up_types = [ggml_type.Q4_K]
down_types = [ggml_type.Q6_K]
hidden_type = ggml_type.BF16
print(f'Parameters: expert_num: {expert_num} hidden_size: {hidden_size} intermediate_size: {intermediate_size}')
print(f'group_max_len: ', group_max_len)
print(f"Parameters: expert_num: {expert_num} hidden_size: {hidden_size} intermediate_size: {intermediate_size}")
print(f"group_max_len: ", group_max_len)
for qlen in qlens:
for gate_type, up_type, down_type in zip(gate_types, up_types, down_types):
@@ -106,18 +112,30 @@ for qlen in qlens:
gate_projs = []
up_projs = []
down_projs = []
print('Preparing data...')
print("Preparing data...")
converted_tensors = []
for _ in range(layer_num):
size = expert_num * intermediate_size * hidden_size
gate_proj = torch.randn((expert_num, intermediate_size, hidden_size), dtype=torch.float32, device = "cuda").to("cpu").contiguous()
up_proj = torch.randn((expert_num, intermediate_size, hidden_size), dtype=torch.float32, device = "cuda").to("cpu").contiguous()
down_proj = torch.randn((expert_num, hidden_size, intermediate_size), dtype=torch.float32, device = "cuda").to("cpu").contiguous()
gate_proj = (
torch.randn((expert_num, intermediate_size, hidden_size), dtype=torch.float32, device="cuda")
.to("cpu")
.contiguous()
)
up_proj = (
torch.randn((expert_num, intermediate_size, hidden_size), dtype=torch.float32, device="cuda")
.to("cpu")
.contiguous()
)
down_proj = (
torch.randn((expert_num, hidden_size, intermediate_size), dtype=torch.float32, device="cuda")
.to("cpu")
.contiguous()
)
gate_tensor = to_cpuinfer_tensor(gate_proj, gate_type)
up_tensor = to_cpuinfer_tensor(up_proj, up_type)
down_tensor = to_cpuinfer_tensor(down_proj, down_type)
config = kt_kernel_ext.moe.MOEConfig(expert_num, num_experts_per_tok, hidden_size, intermediate_size)
config.pool = CPUInfer.backend_
config.stride = stride
@@ -131,59 +149,62 @@ for qlen in qlens:
config.down_type = down_type
config.hidden_type = hidden_type
moe = kt_kernel_ext.moe.MOE(config)
gate_projs.append(gate_proj)
up_projs.append(up_proj)
down_projs.append(down_proj)
down_projs.append(down_proj)
CPUInfer.submit(moe.load_weights_task())
CPUInfer.sync()
moes.append(moe)
converted_tensors.append((gate_tensor, up_tensor, down_tensor))
print('Finished initialization!')
print("Finished initialization!")
CPUInfer.submit(moes[0].warm_up_task())
CPUInfer.sync()
print('Warm up finished!')
print("Warm up finished!")
# validation
progress_bar = tqdm(range(validation_iter), desc="Starting")
total_diff = 0
for i in tqdm(progress_bar):
progress_bar.set_description('Round: {}/{}'.format(i + 1, validation_iter))
expert_ids = torch.stack([torch.randperm(expert_num)[:num_experts_per_tok] for _ in range(qlen)]).contiguous()
progress_bar.set_description("Round: {}/{}".format(i + 1, validation_iter))
expert_ids = torch.stack(
[torch.randperm(expert_num)[:num_experts_per_tok] for _ in range(qlen)]
).contiguous()
weights = torch.rand((qlen, num_experts_per_tok), dtype=torch.float32).contiguous()
input_proj = torch.randn((qlen, hidden_size), dtype=torch.float32).contiguous() / 100
output_proj = torch.empty((qlen, hidden_size), dtype=torch.float32).contiguous()
input_tensor = to_cpuinfer_tensor(input_proj, hidden_type)
output_tensor = to_cpuinfer_tensor(output_proj, hidden_type)
qlen_tensor = torch.tensor([qlen], dtype=torch.int32)
moe = moes[i % layer_num]
CPUInfer.submit(
moe.forward_task(
moe.forward_task(
qlen_tensor.data_ptr(),
num_experts_per_tok,
expert_ids.data_ptr(),
weights.data_ptr(),
input_tensor.data_ptr(),
num_experts_per_tok,
expert_ids.data_ptr(),
weights.data_ptr(),
input_tensor.data_ptr(),
output_tensor.data_ptr(),
)
)
CPUInfer.sync()
cpu_output = from_cpuinfer_tensor(output_tensor, qlen * hidden_size, hidden_type)
gate_proj = gate_projs[i%layer_num]
up_proj = up_projs[i%layer_num]
down_proj = down_projs[i%layer_num]
gate_proj = gate_projs[i % layer_num]
up_proj = up_projs[i % layer_num]
down_proj = down_projs[i % layer_num]
t_output = moe_torch(input_proj, expert_ids, weights, gate_proj, up_proj, down_proj)
print('cpuinfer output', cpu_output)
print('torch output', t_output)
diff = torch.mean(torch.abs(cpu_output.flatten() - t_output.flatten())) / torch.mean(torch.abs(t_output.flatten()))
print("cpuinfer output", cpu_output)
print("torch output", t_output)
diff = torch.mean(torch.abs(cpu_output.flatten() - t_output.flatten())) / torch.mean(
torch.abs(t_output.flatten())
)
assert diff < 0.5
total_diff += diff
print(f'gate_type: {gate_type}, up_type: {up_type}, down_type: {down_type}')
print(f'Average diff: {total_diff / validation_iter:.4f}')
print(f"gate_type: {gate_type}, up_type: {up_type}, down_type: {down_type}")
print(f"Average diff: {total_diff / validation_iter:.4f}")

View File

@@ -4,7 +4,7 @@ sys.path.insert(0, os.path.dirname(__file__) + "/../build")
print("sys.path:", sys.path)
import torch
import kt_kernel_ext
from kt_kernel import kt_kernel_ext
expert_num = 256
hidden_size = 7168

View File

@@ -15,7 +15,7 @@ import time
sys.path.insert(0, os.path.dirname(__file__) + "/../build")
os.environ["BLAS_NUM_THREADS"] = "1"
import torch
import kt_kernel_ext
from kt_kernel import kt_kernel_ext
expert_num = 16

View File

@@ -14,7 +14,7 @@ import time
sys.path.insert(0, os.path.dirname(__file__) + "/../build")
os.environ["BLAS_NUM_THREADS"] = "1"
import kt_kernel_ext
from kt_kernel import kt_kernel_ext
import torch
expert_num = 16

View File

@@ -15,7 +15,7 @@ from abc import ABC, abstractmethod
import os
import ctypes
import kt_kernel_ext
from kt_kernel import kt_kernel_ext
class KExpertsCPUBuffer: