mirror of
https://github.com/kvcache-ai/ktransformers.git
synced 2026-03-14 18:37:23 +00:00
[fix](test): fix import kt-kernel (#1728)
This commit is contained in:
@@ -1,19 +1,19 @@
|
||||
#!/usr/bin/env python
|
||||
# coding=utf-8
|
||||
"""
|
||||
Description :
|
||||
Description :
|
||||
Author : Jianwei Dong
|
||||
Date : 2024-08-28 10:32:05
|
||||
Version : 1.0.0
|
||||
LastEditors : Jianwei Dong
|
||||
LastEditors : Jianwei Dong
|
||||
LastEditTime : 2024-08-28 10:32:05
|
||||
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
|
||||
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
|
||||
"""
|
||||
import os, sys
|
||||
import time
|
||||
|
||||
sys.path.append(os.path.dirname(__file__) + "/../build")
|
||||
import kt_kernel_ext
|
||||
from kt_kernel import kt_kernel_ext
|
||||
import torch
|
||||
|
||||
layer_num = 10
|
||||
@@ -61,11 +61,7 @@ def bench_linear(cache_seqlen: int):
|
||||
max_thread_num,
|
||||
)
|
||||
local_kvcache = kt_kernel_ext.kvcache.KVCache(config)
|
||||
block_table = (
|
||||
torch.arange(max_block_num, dtype=torch.int32, device="cpu")
|
||||
.contiguous()
|
||||
.view(1, -1)
|
||||
)
|
||||
block_table = torch.arange(max_block_num, dtype=torch.int32, device="cpu").contiguous().view(1, -1)
|
||||
|
||||
for layer_idx in range(layer_num):
|
||||
k_cache = torch.randn(
|
||||
@@ -93,17 +89,11 @@ def bench_linear(cache_seqlen: int):
|
||||
)
|
||||
CPUInfer.sync()
|
||||
|
||||
input = torch.randn(
|
||||
(1, 1, q_head_num, head_dim), dtype=torch.float16, device="cpu"
|
||||
).contiguous()
|
||||
output = torch.empty(
|
||||
(1, 1, q_head_num, head_dim), dtype=torch.float16, device="cpu"
|
||||
).contiguous()
|
||||
input = torch.randn((1, 1, q_head_num, head_dim), dtype=torch.float16, device="cpu").contiguous()
|
||||
output = torch.empty((1, 1, q_head_num, head_dim), dtype=torch.float16, device="cpu").contiguous()
|
||||
|
||||
# attn_lse: (bsz, q_len, q_head_num)
|
||||
attn_lse = torch.empty(
|
||||
(1, 1, q_head_num), dtype=torch.float32, device="cpu"
|
||||
).contiguous()
|
||||
attn_lse = torch.empty((1, 1, q_head_num), dtype=torch.float32, device="cpu").contiguous()
|
||||
input = input / 100
|
||||
|
||||
# warm up
|
||||
@@ -156,16 +146,7 @@ def bench_linear(cache_seqlen: int):
|
||||
print("Time(us) per iteration: ", total_time / test_iter * 1000000)
|
||||
print(
|
||||
"Bandwidth: ",
|
||||
cache_seqlen
|
||||
* kv_head_num
|
||||
* head_dim
|
||||
* 2
|
||||
* 2
|
||||
* test_iter
|
||||
/ total_time
|
||||
/ 1000
|
||||
/ 1000
|
||||
/ 1000,
|
||||
cache_seqlen * kv_head_num * head_dim * 2 * 2 * test_iter / total_time / 1000 / 1000 / 1000,
|
||||
"GB/s",
|
||||
)
|
||||
print("")
|
||||
|
||||
@@ -1,19 +1,19 @@
|
||||
#!/usr/bin/env python
|
||||
# coding=utf-8
|
||||
"""
|
||||
Description :
|
||||
Description :
|
||||
Author : Jianwei Dong
|
||||
Date : 2024-08-28 10:32:05
|
||||
Version : 1.0.0
|
||||
LastEditors : Jianwei Dong
|
||||
LastEditors : Jianwei Dong
|
||||
LastEditTime : 2024-08-28 10:32:05
|
||||
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
|
||||
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
|
||||
"""
|
||||
import os, sys
|
||||
import time
|
||||
|
||||
sys.path.append(os.path.dirname(__file__) + "/../build")
|
||||
import kt_kernel_ext
|
||||
from kt_kernel import kt_kernel_ext
|
||||
import torch
|
||||
|
||||
layer_num = 10
|
||||
@@ -45,9 +45,7 @@ def bench_linear(cache_seqlen: int, device):
|
||||
|
||||
kvcaches.append((k_cache, v_cache))
|
||||
|
||||
input = torch.randn(
|
||||
(1, q_head_num, 1, head_dim), dtype=torch.float16, device=device
|
||||
).contiguous()
|
||||
input = torch.randn((1, q_head_num, 1, head_dim), dtype=torch.float16, device=device).contiguous()
|
||||
input = input / 100
|
||||
|
||||
# warm up
|
||||
@@ -70,16 +68,7 @@ def bench_linear(cache_seqlen: int, device):
|
||||
print("Time(us) per iteration: ", total_time / test_iter * 1000000)
|
||||
print(
|
||||
"Bandwidth: ",
|
||||
cache_seqlen
|
||||
* q_head_num
|
||||
* head_dim
|
||||
* 2
|
||||
* 2
|
||||
* test_iter
|
||||
/ total_time
|
||||
/ 1000
|
||||
/ 1000
|
||||
/ 1000,
|
||||
cache_seqlen * q_head_num * head_dim * 2 * 2 * test_iter / total_time / 1000 / 1000 / 1000,
|
||||
"GB/s",
|
||||
)
|
||||
print("")
|
||||
|
||||
@@ -15,7 +15,7 @@ from tqdm import tqdm
|
||||
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "build"))
|
||||
|
||||
import kt_kernel_ext
|
||||
from kt_kernel import kt_kernel_ext
|
||||
import torch
|
||||
|
||||
# Benchmark parameters (single MoE, no layer loop)
|
||||
@@ -29,9 +29,7 @@ warm_up_iter = 1000
|
||||
test_iter = 5000
|
||||
k_group_size = 32
|
||||
|
||||
physical_to_logical_map = (
|
||||
torch.tensor(data=range(expert_num), device="cpu", dtype=torch.int64).contiguous()
|
||||
)
|
||||
physical_to_logical_map = torch.tensor(data=range(expert_num), device="cpu", dtype=torch.int64).contiguous()
|
||||
|
||||
worker_config = kt_kernel_ext.WorkerPoolConfig()
|
||||
worker_config.subpool_count = 2
|
||||
@@ -43,24 +41,12 @@ CPUInfer = kt_kernel_ext.CPUInfer(worker_config)
|
||||
def get_git_commit():
|
||||
result = {}
|
||||
try:
|
||||
commit = (
|
||||
subprocess.check_output(["git", "rev-parse", "HEAD"])
|
||||
.decode("utf-8")
|
||||
.strip()
|
||||
)
|
||||
commit_msg = (
|
||||
subprocess.check_output(["git", "log", "-1", "--pretty=%B"])
|
||||
.decode("utf-8")
|
||||
.strip()
|
||||
)
|
||||
commit = subprocess.check_output(["git", "rev-parse", "HEAD"]).decode("utf-8").strip()
|
||||
commit_msg = subprocess.check_output(["git", "log", "-1", "--pretty=%B"]).decode("utf-8").strip()
|
||||
result["commit"] = commit
|
||||
result["commit_message"] = commit_msg
|
||||
|
||||
dirty_output = (
|
||||
subprocess.check_output(["git", "status", "--porcelain"])
|
||||
.decode("utf-8")
|
||||
.strip()
|
||||
)
|
||||
dirty_output = subprocess.check_output(["git", "status", "--porcelain"]).decode("utf-8").strip()
|
||||
if dirty_output:
|
||||
result["dirty"] = True
|
||||
result["dirty_files"] = dirty_output.splitlines()
|
||||
@@ -132,9 +118,7 @@ def record_results(result, filename=json_path):
|
||||
f.write(json.dumps(result) + "\n")
|
||||
|
||||
|
||||
def pack_to_int32(
|
||||
value: torch.Tensor, num_bits: int, packed_dim: int = 1
|
||||
) -> torch.Tensor:
|
||||
def pack_to_int32(value: torch.Tensor, num_bits: int, packed_dim: int = 1) -> torch.Tensor:
|
||||
if value.dtype is not torch.int8:
|
||||
raise ValueError("Tensor must be torch.int8 before packing")
|
||||
if not (1 <= num_bits <= 8):
|
||||
@@ -181,9 +165,7 @@ def quantize_k2_tensor(weights: torch.Tensor, group_size: int):
|
||||
weights_f32 = weights.to(torch.float32)
|
||||
e, rows, cols = weights_f32.shape
|
||||
if cols % group_size != 0 or cols % 2 != 0:
|
||||
raise ValueError(
|
||||
f"cols ({cols}) must be divisible by group_size ({group_size}) and 2"
|
||||
)
|
||||
raise ValueError(f"cols ({cols}) must be divisible by group_size ({group_size}) and 2")
|
||||
|
||||
reshaped = weights_f32.view(e, rows, cols // group_size, group_size)
|
||||
max_abs = reshaped.abs().amax(dim=-1, keepdim=True).clamp(min=1e-8)
|
||||
@@ -191,9 +173,7 @@ def quantize_k2_tensor(weights: torch.Tensor, group_size: int):
|
||||
q = torch.round(reshaped / scales.unsqueeze(-1)).clamp(-8, 7).to(torch.int8)
|
||||
q = q.view(e, rows, cols)
|
||||
packed = pack_tensor_per_row(q, num_bits=4).view(e, rows, cols // 8).contiguous()
|
||||
scales = scales.to(torch.bfloat16).contiguous().view(
|
||||
e, rows, cols // group_size
|
||||
).contiguous()
|
||||
scales = scales.to(torch.bfloat16).contiguous().view(e, rows, cols // group_size).contiguous()
|
||||
return packed, scales
|
||||
|
||||
|
||||
@@ -233,9 +213,7 @@ def bench_k2_moe():
|
||||
bytes_per_elem = 0.5 + 2.0 / k_group_size
|
||||
|
||||
quant_data = build_quantized_layer_weights()
|
||||
config = kt_kernel_ext.moe.MOEConfig(
|
||||
expert_num, num_experts_per_tok, hidden_size, intermediate_size, 0
|
||||
)
|
||||
config = kt_kernel_ext.moe.MOEConfig(expert_num, num_experts_per_tok, hidden_size, intermediate_size, 0)
|
||||
config.max_len = max_len
|
||||
config.quant_config.bits = 4
|
||||
config.quant_config.group_size = k_group_size
|
||||
@@ -261,12 +239,8 @@ def bench_k2_moe():
|
||||
.reshape(gen_iter, qlen * num_experts_per_tok)
|
||||
.contiguous()
|
||||
)
|
||||
weights = torch.rand(
|
||||
(gen_iter, qlen, num_experts_per_tok), dtype=torch.float32, device="cpu"
|
||||
).contiguous()
|
||||
input_tensor = torch.randn(
|
||||
(qlen, hidden_size), dtype=torch.bfloat16, device="cpu"
|
||||
).contiguous()
|
||||
weights = torch.rand((gen_iter, qlen, num_experts_per_tok), dtype=torch.float32, device="cpu").contiguous()
|
||||
input_tensor = torch.randn((qlen, hidden_size), dtype=torch.bfloat16, device="cpu").contiguous()
|
||||
output_tensor = torch.empty_like(input_tensor)
|
||||
bsz_tensor = torch.tensor([qlen], device="cpu")
|
||||
|
||||
@@ -313,17 +287,7 @@ def bench_k2_moe():
|
||||
/ total_time
|
||||
/ 1e9
|
||||
)
|
||||
flops = (
|
||||
hidden_size
|
||||
* intermediate_size
|
||||
* qlen
|
||||
* 3
|
||||
* num_experts_per_tok
|
||||
* 2
|
||||
* test_iter
|
||||
/ total_time
|
||||
/ 1e12
|
||||
)
|
||||
flops = hidden_size * intermediate_size * qlen * 3 * num_experts_per_tok * 2 * test_iter / total_time / 1e12
|
||||
|
||||
print("Quant mode: int4_k2")
|
||||
print("Time(s): ", total_time)
|
||||
|
||||
@@ -14,7 +14,7 @@ from tqdm import tqdm
|
||||
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "build"))
|
||||
|
||||
import kt_kernel_ext
|
||||
from kt_kernel import kt_kernel_ext
|
||||
import torch
|
||||
|
||||
# Benchmark parameters (single MoE, mirror examples/test_k2_write_buffer.py)
|
||||
@@ -39,20 +39,12 @@ CPUInfer = kt_kernel_ext.CPUInfer(96)
|
||||
def get_git_commit():
|
||||
result = {}
|
||||
try:
|
||||
commit = (
|
||||
subprocess.check_output(["git", "rev-parse", "HEAD"]).decode("utf-8").strip()
|
||||
)
|
||||
commit_msg = (
|
||||
subprocess.check_output(["git", "log", "-1", "--pretty=%B"])
|
||||
.decode("utf-8")
|
||||
.strip()
|
||||
)
|
||||
commit = subprocess.check_output(["git", "rev-parse", "HEAD"]).decode("utf-8").strip()
|
||||
commit_msg = subprocess.check_output(["git", "log", "-1", "--pretty=%B"]).decode("utf-8").strip()
|
||||
result["commit"] = commit
|
||||
result["commit_message"] = commit_msg
|
||||
|
||||
dirty_output = (
|
||||
subprocess.check_output(["git", "status", "--porcelain"]).decode("utf-8").strip()
|
||||
)
|
||||
dirty_output = subprocess.check_output(["git", "status", "--porcelain"]).decode("utf-8").strip()
|
||||
if dirty_output:
|
||||
result["dirty"] = True
|
||||
result["dirty_files"] = dirty_output.splitlines()
|
||||
@@ -160,9 +152,7 @@ def build_moe():
|
||||
per_mat_scale_elems,
|
||||
) = allocate_weights()
|
||||
|
||||
config = kt_kernel_ext.moe.MOEConfig(
|
||||
expert_num, num_experts_per_tok, hidden_size, intermediate_size
|
||||
)
|
||||
config = kt_kernel_ext.moe.MOEConfig(expert_num, num_experts_per_tok, hidden_size, intermediate_size)
|
||||
config.max_len = max_len
|
||||
config.quant_config.bits = 4
|
||||
config.quant_config.group_size = group_size
|
||||
@@ -186,18 +176,10 @@ def build_moe():
|
||||
total_weight_bytes_per_tp = gpu_experts_num * weight_bytes_per_expert_per_tp
|
||||
total_scale_elems_per_tp = gpu_experts_num * scale_elems_per_expert_per_tp
|
||||
|
||||
w13_weight_bufs = [
|
||||
torch.empty(2 * total_weight_bytes_per_tp, dtype=torch.uint8) for _ in range(gpu_tp_count)
|
||||
]
|
||||
w13_scale_bufs = [
|
||||
torch.empty(2 * total_scale_elems_per_tp, dtype=torch.bfloat16) for _ in range(gpu_tp_count)
|
||||
]
|
||||
w2_weight_bufs = [
|
||||
torch.empty(total_weight_bytes_per_tp, dtype=torch.uint8) for _ in range(gpu_tp_count)
|
||||
]
|
||||
w2_scale_bufs = [
|
||||
torch.empty(total_scale_elems_per_tp, dtype=torch.bfloat16) for _ in range(gpu_tp_count)
|
||||
]
|
||||
w13_weight_bufs = [torch.empty(2 * total_weight_bytes_per_tp, dtype=torch.uint8) for _ in range(gpu_tp_count)]
|
||||
w13_scale_bufs = [torch.empty(2 * total_scale_elems_per_tp, dtype=torch.bfloat16) for _ in range(gpu_tp_count)]
|
||||
w2_weight_bufs = [torch.empty(total_weight_bytes_per_tp, dtype=torch.uint8) for _ in range(gpu_tp_count)]
|
||||
w2_scale_bufs = [torch.empty(total_scale_elems_per_tp, dtype=torch.bfloat16) for _ in range(gpu_tp_count)]
|
||||
|
||||
buffer_ptrs = {
|
||||
"w13_weight_ptrs": [buf.data_ptr() for buf in w13_weight_bufs],
|
||||
@@ -248,7 +230,7 @@ def bench_write_buffer():
|
||||
)
|
||||
)
|
||||
CPUInfer.sync()
|
||||
|
||||
|
||||
total_time = 0
|
||||
for _ in tqdm(range(test_iter), desc="Testing"):
|
||||
start = time.perf_counter()
|
||||
@@ -265,8 +247,6 @@ def bench_write_buffer():
|
||||
time.sleep(0.6)
|
||||
print(end - start)
|
||||
|
||||
|
||||
|
||||
time_per_iter_us = total_time / test_iter * 1e6
|
||||
bandwidth_gbs = bytes_per_call * test_iter / total_time / 1e9
|
||||
|
||||
|
||||
@@ -1,18 +1,19 @@
|
||||
#!/usr/bin/env python
|
||||
# coding=utf-8
|
||||
'''
|
||||
Description :
|
||||
"""
|
||||
Description :
|
||||
Author : chenht2022
|
||||
Date : 2024-07-25 10:31:59
|
||||
Version : 1.0.0
|
||||
LastEditors : chenht2022
|
||||
LastEditors : chenht2022
|
||||
LastEditTime : 2024-08-06 10:35:35
|
||||
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
|
||||
'''
|
||||
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
|
||||
"""
|
||||
import os, sys
|
||||
import time
|
||||
sys.path.append(os.path.dirname(__file__) + '/../build')
|
||||
import kt_kernel_ext
|
||||
|
||||
sys.path.append(os.path.dirname(__file__) + "/../build")
|
||||
from kt_kernel import kt_kernel_ext
|
||||
import torch
|
||||
|
||||
input_size = 16384
|
||||
@@ -25,65 +26,64 @@ CPUInfer = kt_kernel_ext.CPUInfer(64)
|
||||
warm_up_iter = 1000
|
||||
test_iter = 10000
|
||||
|
||||
|
||||
def bench_linear(quant_mode: str):
|
||||
with torch.inference_mode(mode=True):
|
||||
|
||||
hidden_type = 30 # ggml_type::GGML_TYPE_BF16
|
||||
hidden_type = 30 # ggml_type::GGML_TYPE_BF16
|
||||
if quant_mode == "fp32":
|
||||
proj_type = 0 # ggml_type::GGML_TYPE_F32
|
||||
proj_type = 0 # ggml_type::GGML_TYPE_F32
|
||||
bytes_per_elem = 4.000000
|
||||
elif quant_mode == "fp16":
|
||||
proj_type = 1 # ggml_type::GGML_TYPE_F16
|
||||
proj_type = 1 # ggml_type::GGML_TYPE_F16
|
||||
bytes_per_elem = 2.000000
|
||||
elif quant_mode == "bf16":
|
||||
proj_type = 30 # ggml_type::GGML_TYPE_BF16
|
||||
proj_type = 30 # ggml_type::GGML_TYPE_BF16
|
||||
bytes_per_elem = 2.000000
|
||||
elif quant_mode == "q8_0":
|
||||
proj_type = 8 # ggml_type::GGML_TYPE_Q8_0
|
||||
proj_type = 8 # ggml_type::GGML_TYPE_Q8_0
|
||||
bytes_per_elem = 1.062500
|
||||
elif quant_mode == "q6_k":
|
||||
proj_type = 14 # ggml_type::GGML_TYPE_Q6_K
|
||||
proj_type = 14 # ggml_type::GGML_TYPE_Q6_K
|
||||
bytes_per_elem = 0.820312
|
||||
elif quant_mode == "q5_k_m":
|
||||
proj_type = 13 # ggml_type::GGML_TYPE_Q5_K
|
||||
proj_type = 13 # ggml_type::GGML_TYPE_Q5_K
|
||||
bytes_per_elem = 0.687500
|
||||
elif quant_mode == "q4_k_m":
|
||||
proj_type = 12 # ggml_type::GGML_TYPE_Q4_K
|
||||
proj_type = 12 # ggml_type::GGML_TYPE_Q4_K
|
||||
bytes_per_elem = 0.562500
|
||||
elif quant_mode == "q3_k_m":
|
||||
proj_type = 11 # ggml_type::GGML_TYPE_Q3_K
|
||||
proj_type = 11 # ggml_type::GGML_TYPE_Q3_K
|
||||
bytes_per_elem = 0.429688
|
||||
elif quant_mode == "q2_k":
|
||||
proj_type = 10 # ggml_type::GGML_TYPE_Q2_K
|
||||
proj_type = 10 # ggml_type::GGML_TYPE_Q2_K
|
||||
bytes_per_elem = 0.328125
|
||||
elif quant_mode == "iq3_xs":
|
||||
proj_type = 21 # ggml_type::GGML_TYPE_IQ3_S
|
||||
proj_type = 21 # ggml_type::GGML_TYPE_IQ3_S
|
||||
bytes_per_elem = 0.429688
|
||||
elif quant_mode == "iq2_xxs":
|
||||
proj_type = 16 # ggml_type::GGML_TYPE_IQ2_XXS
|
||||
proj_type = 16 # ggml_type::GGML_TYPE_IQ2_XXS
|
||||
bytes_per_elem = 0.257812
|
||||
else:
|
||||
assert(False)
|
||||
assert False
|
||||
|
||||
linears = []
|
||||
projs = []
|
||||
for _ in range(layer_num):
|
||||
proj = torch.randn((output_size, input_size), dtype=torch.float32, device = "cuda").to("cpu").contiguous()
|
||||
config = kt_kernel_ext.linear.LinearConfig(input_size, output_size, stride, group_max_len, proj.data_ptr(), proj_type, hidden_type)
|
||||
proj = torch.randn((output_size, input_size), dtype=torch.float32, device="cuda").to("cpu").contiguous()
|
||||
config = kt_kernel_ext.linear.LinearConfig(
|
||||
input_size, output_size, stride, group_max_len, proj.data_ptr(), proj_type, hidden_type
|
||||
)
|
||||
linear = kt_kernel_ext.linear.Linear(config)
|
||||
projs.append(proj)
|
||||
linears.append(linear)
|
||||
input = torch.randn((layer_num, qlen, input_size), dtype=torch.bfloat16, device = "cuda").to("cpu").contiguous()
|
||||
output = torch.empty((layer_num, qlen, output_size), dtype=torch.bfloat16, device = "cuda").to("cpu").contiguous()
|
||||
input = torch.randn((layer_num, qlen, input_size), dtype=torch.bfloat16, device="cuda").to("cpu").contiguous()
|
||||
output = torch.empty((layer_num, qlen, output_size), dtype=torch.bfloat16, device="cuda").to("cpu").contiguous()
|
||||
|
||||
# warm up
|
||||
for i in range(warm_up_iter):
|
||||
CPUInfer.submit(
|
||||
linears[i % layer_num].forward(
|
||||
qlen,
|
||||
input[i % layer_num].data_ptr(),
|
||||
output[i % layer_num].data_ptr()
|
||||
)
|
||||
linears[i % layer_num].forward(qlen, input[i % layer_num].data_ptr(), output[i % layer_num].data_ptr())
|
||||
)
|
||||
CPUInfer.sync()
|
||||
|
||||
@@ -91,21 +91,22 @@ def bench_linear(quant_mode: str):
|
||||
start = time.perf_counter()
|
||||
for i in range(test_iter):
|
||||
CPUInfer.submit(
|
||||
linears[i % layer_num].forward(
|
||||
qlen,
|
||||
input[i % layer_num].data_ptr(),
|
||||
output[i % layer_num].data_ptr()
|
||||
)
|
||||
linears[i % layer_num].forward(qlen, input[i % layer_num].data_ptr(), output[i % layer_num].data_ptr())
|
||||
)
|
||||
CPUInfer.sync()
|
||||
end = time.perf_counter()
|
||||
total_time = end - start
|
||||
print('Quant mode: ', quant_mode)
|
||||
print('Time(s): ', total_time)
|
||||
print('Iteration: ', test_iter)
|
||||
print('Time(us) per iteration: ', total_time / test_iter * 1000000)
|
||||
print('Bandwidth: ', input_size * output_size * bytes_per_elem * test_iter / total_time / 1000 / 1000 / 1000, 'GB/s')
|
||||
print('')
|
||||
print("Quant mode: ", quant_mode)
|
||||
print("Time(s): ", total_time)
|
||||
print("Iteration: ", test_iter)
|
||||
print("Time(us) per iteration: ", total_time / test_iter * 1000000)
|
||||
print(
|
||||
"Bandwidth: ",
|
||||
input_size * output_size * bytes_per_elem * test_iter / total_time / 1000 / 1000 / 1000,
|
||||
"GB/s",
|
||||
)
|
||||
print("")
|
||||
|
||||
|
||||
bench_linear("fp32")
|
||||
bench_linear("fp16")
|
||||
|
||||
@@ -3,9 +3,10 @@ import time
|
||||
import subprocess
|
||||
import platform
|
||||
import json
|
||||
|
||||
os.environ["BLAS_NUM_THREADS"] = "1"
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'build'))
|
||||
import kt_kernel_ext
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "build"))
|
||||
from kt_kernel import kt_kernel_ext
|
||||
from kt_kernel_ext.kvcache import ggml_type
|
||||
import torch
|
||||
from torch import inf, nn
|
||||
@@ -31,9 +32,9 @@ layer_num = 10
|
||||
|
||||
|
||||
rope_theta = 10000
|
||||
max_qlen = qlen+kvlen
|
||||
max_qlen = qlen + kvlen
|
||||
max_kvlen = 4096
|
||||
max_position_embeddings = 163840
|
||||
max_position_embeddings = 163840
|
||||
|
||||
rope_scaling = {
|
||||
"beta_fast": 32,
|
||||
@@ -42,7 +43,7 @@ rope_scaling = {
|
||||
"mscale": 1.0,
|
||||
"mscale_all_dim": 1.0,
|
||||
"original_max_position_embeddings": 4096,
|
||||
"type": "yarn"
|
||||
"type": "yarn",
|
||||
}
|
||||
|
||||
CPUINFER_PARAM = 304
|
||||
@@ -54,13 +55,12 @@ warm_up_iter = 20
|
||||
test_iter = 100
|
||||
|
||||
|
||||
|
||||
|
||||
# 获取脚本相关信息,用于生成结果保存文件名
|
||||
script_path = os.path.abspath(__file__)
|
||||
script_dir = os.path.dirname(script_path)
|
||||
script_name = os.path.splitext(os.path.basename(script_path))[0]
|
||||
json_path = os.path.join(script_dir, "bench_results "+ ".jsonl")
|
||||
json_path = os.path.join(script_dir, "bench_results " + ".jsonl")
|
||||
|
||||
|
||||
def get_git_commit():
|
||||
"""
|
||||
@@ -100,9 +100,9 @@ def get_system_info():
|
||||
|
||||
# 获取 CPU 型号(仅 Linux 支持)
|
||||
cpu_model = None
|
||||
if os.path.exists('/proc/cpuinfo'):
|
||||
if os.path.exists("/proc/cpuinfo"):
|
||||
try:
|
||||
with open('/proc/cpuinfo', 'r') as f:
|
||||
with open("/proc/cpuinfo", "r") as f:
|
||||
for line in f:
|
||||
if "model name" in line:
|
||||
cpu_model = line.split(":", 1)[1].strip()
|
||||
@@ -113,9 +113,9 @@ def get_system_info():
|
||||
|
||||
# 获取内存大小(单位:GB),仅 Linux 支持
|
||||
mem_total_gb = None
|
||||
if os.path.exists('/proc/meminfo'):
|
||||
if os.path.exists("/proc/meminfo"):
|
||||
try:
|
||||
with open('/proc/meminfo', 'r') as f:
|
||||
with open("/proc/meminfo", "r") as f:
|
||||
for line in f:
|
||||
if "MemTotal" in line:
|
||||
mem_kb = float(line.split(":", 1)[1].split()[0])
|
||||
@@ -149,6 +149,7 @@ def record_results(result, filename=json_path):
|
||||
with open(filename, "a") as f:
|
||||
f.write(json.dumps(result) + "\n")
|
||||
|
||||
|
||||
def bench_mla(quant_mode: str):
|
||||
"""
|
||||
测试 MLA 模型的性能
|
||||
@@ -171,22 +172,22 @@ def bench_mla(quant_mode: str):
|
||||
w_o_type = 1
|
||||
bytes_per_elem = 2.000000
|
||||
elif quant_mode == "q4_k_m":
|
||||
q_a_proj_type = 12 # ggml_type::GGML_TYPE_Q4_K
|
||||
q_a_proj_type = 12 # ggml_type::GGML_TYPE_Q4_K
|
||||
q_b_proj_type = 12
|
||||
kv_a_proj_with_mqa_type = 12 # ggml_type::GGML_TYPE_Q6_K
|
||||
kv_a_proj_with_mqa_type = 12 # ggml_type::GGML_TYPE_Q6_K
|
||||
kv_b_proj_type = 12
|
||||
w_o_type = 12
|
||||
bytes_per_elem = 0.5625
|
||||
else:
|
||||
raise ValueError("不支持的量化模式")
|
||||
|
||||
# 构建各层 MLA 模型的输入数据
|
||||
|
||||
# 构建各层 MLA 模型的输入数据
|
||||
mlas = []
|
||||
for i in tqdm(range(layer_num)):
|
||||
q_a_proj = nn.Linear(hidden_size, q_lora_rank, bias=False, dtype=torch.float16)
|
||||
q_b_proj = nn.Linear(q_lora_rank, num_heads * (nope_size+rope_size) , bias=False, dtype=torch.float16)
|
||||
q_b_proj = nn.Linear(q_lora_rank, num_heads * (nope_size + rope_size), bias=False, dtype=torch.float16)
|
||||
kv_a_proj_with_mqa = nn.Linear(hidden_size, kv_lora_rank + rope_size, bias=False, dtype=torch.float16)
|
||||
kv_b_proj = nn.Linear( num_heads * (nope_size + nope_size),kv_lora_rank, bias=False, dtype=torch.float16)
|
||||
kv_b_proj = nn.Linear(num_heads * (nope_size + nope_size), kv_lora_rank, bias=False, dtype=torch.float16)
|
||||
o_proj = nn.Linear(num_heads * nope_size, hidden_size, bias=False, dtype=torch.float16)
|
||||
|
||||
init.normal_(q_a_proj.weight, mean=0.0, std=0.02)
|
||||
@@ -194,11 +195,11 @@ def bench_mla(quant_mode: str):
|
||||
init.normal_(kv_a_proj_with_mqa.weight, mean=0.0, std=0.02)
|
||||
init.normal_(kv_b_proj.weight, mean=0.0, std=0.02)
|
||||
init.normal_(o_proj.weight, mean=0.0, std=0.02)
|
||||
q_a_proj_weight = q_a_proj.weight.to(torch.float16).to('cpu').contiguous()
|
||||
q_b_proj_weight = q_b_proj.weight.to(torch.float16).to('cpu').contiguous()
|
||||
kv_a_proj_with_mqa_weight = kv_a_proj_with_mqa.weight.to('cpu').to(torch.float16).contiguous()
|
||||
kv_b_proj_weight = kv_b_proj.weight.to(torch.float16).to('cpu').contiguous()
|
||||
o_proj_weight = o_proj.weight.to(torch.float16).to('cpu').contiguous()
|
||||
q_a_proj_weight = q_a_proj.weight.to(torch.float16).to("cpu").contiguous()
|
||||
q_b_proj_weight = q_b_proj.weight.to(torch.float16).to("cpu").contiguous()
|
||||
kv_a_proj_with_mqa_weight = kv_a_proj_with_mqa.weight.to("cpu").to(torch.float16).contiguous()
|
||||
kv_b_proj_weight = kv_b_proj.weight.to(torch.float16).to("cpu").contiguous()
|
||||
o_proj_weight = o_proj.weight.to(torch.float16).to("cpu").contiguous()
|
||||
|
||||
config = kt_kernel_ext.mla.MLAConfig(
|
||||
hidden_size,
|
||||
@@ -210,7 +211,7 @@ def bench_mla(quant_mode: str):
|
||||
)
|
||||
config.max_qlen = max_qlen
|
||||
config.max_kvlen = max_kvlen
|
||||
config.max_position_embeddings = max_position_embeddings
|
||||
config.max_position_embeddings = max_position_embeddings
|
||||
config.rope_scaling_factor = rope_scaling["factor"]
|
||||
config.rope_theta = rope_theta
|
||||
config.rope_scaling_beta_fast = rope_scaling["beta_fast"]
|
||||
@@ -231,64 +232,85 @@ def bench_mla(quant_mode: str):
|
||||
config.kv_b_proj_type = ggml_type.FP16
|
||||
config.w_o_type = ggml_type.FP16
|
||||
|
||||
|
||||
config.pool = CPUInfer.backend_
|
||||
|
||||
|
||||
|
||||
mla = kt_kernel_ext.mla.MLA(config)
|
||||
mla.load_weights()
|
||||
mla.set_local_pages(pages_count)
|
||||
mlas.append(mla)
|
||||
|
||||
print('Generating data...')
|
||||
input_tensor = torch.randn((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device="cpu").to("cpu").contiguous()
|
||||
output_tensor = torch.empty((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device="cpu").to("cpu").contiguous()
|
||||
|
||||
print('Warming up...')
|
||||
print("Generating data...")
|
||||
input_tensor = (
|
||||
torch.randn((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device="cpu").to("cpu").contiguous()
|
||||
)
|
||||
output_tensor = (
|
||||
torch.empty((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device="cpu").to("cpu").contiguous()
|
||||
)
|
||||
|
||||
print("Warming up...")
|
||||
|
||||
for i in tqdm(range(warm_up_iter)):
|
||||
mlas[i%layer_num].forward([qlen],[page_table],[kvlen],
|
||||
input_tensor[i%layer_num].data_ptr(),output_tensor[i%layer_num].data_ptr())
|
||||
mlas[i % layer_num].forward(
|
||||
[qlen],
|
||||
[page_table],
|
||||
[kvlen],
|
||||
input_tensor[i % layer_num].data_ptr(),
|
||||
output_tensor[i % layer_num].data_ptr(),
|
||||
)
|
||||
|
||||
|
||||
print('Start testing...')
|
||||
print("Start testing...")
|
||||
|
||||
start = time.perf_counter()
|
||||
for i in tqdm(range(test_iter)):
|
||||
mlas[i%layer_num].forward([qlen],[page_table],[kvlen],
|
||||
input_tensor[i%layer_num].data_ptr(),output_tensor[i%layer_num].data_ptr())
|
||||
mlas[i % layer_num].forward(
|
||||
[qlen],
|
||||
[page_table],
|
||||
[kvlen],
|
||||
input_tensor[i % layer_num].data_ptr(),
|
||||
output_tensor[i % layer_num].data_ptr(),
|
||||
)
|
||||
|
||||
end = time.perf_counter()
|
||||
total_time = end - start
|
||||
|
||||
time_per_iter_us = (total_time * 1e6) / test_iter
|
||||
bandwidth = bytes_per_elem * (q_lora_rank * hidden_size
|
||||
+ (kv_lora_rank+rope_size) * hidden_size
|
||||
+ (nope_size+rope_size) * q_lora_rank * num_heads
|
||||
+ (nope_size+nope_size)*kv_lora_rank * num_heads
|
||||
+ hidden_size * nope_size * num_heads
|
||||
+ hidden_size * qlen) * test_iter / (total_time * 1e9)
|
||||
flops = 2*(
|
||||
q_lora_rank*hidden_size*qlen
|
||||
+ kv_lora_rank * hidden_size * qlen
|
||||
+num_heads* (nope_size+rope_size)*q_lora_rank*qlen
|
||||
+ num_heads * qlen * nope_size * kv_lora_rank
|
||||
+ num_heads * (kvlen+qlen) * kv_lora_rank * qlen
|
||||
+ num_heads * rope_size * qlen * (qlen+kvlen)
|
||||
+ num_heads * kv_lora_rank * (qlen + kvlen) * qlen
|
||||
+ num_heads * nope_size * kv_lora_rank * qlen
|
||||
+ hidden_size * num_heads* nope_size * qlen
|
||||
) * test_iter / (total_time * 1e12)
|
||||
bandwidth = (
|
||||
bytes_per_elem
|
||||
* (
|
||||
q_lora_rank * hidden_size
|
||||
+ (kv_lora_rank + rope_size) * hidden_size
|
||||
+ (nope_size + rope_size) * q_lora_rank * num_heads
|
||||
+ (nope_size + nope_size) * kv_lora_rank * num_heads
|
||||
+ hidden_size * nope_size * num_heads
|
||||
+ hidden_size * qlen
|
||||
)
|
||||
* test_iter
|
||||
/ (total_time * 1e9)
|
||||
)
|
||||
flops = (
|
||||
2
|
||||
* (
|
||||
q_lora_rank * hidden_size * qlen
|
||||
+ kv_lora_rank * hidden_size * qlen
|
||||
+ num_heads * (nope_size + rope_size) * q_lora_rank * qlen
|
||||
+ num_heads * qlen * nope_size * kv_lora_rank
|
||||
+ num_heads * (kvlen + qlen) * kv_lora_rank * qlen
|
||||
+ num_heads * rope_size * qlen * (qlen + kvlen)
|
||||
+ num_heads * kv_lora_rank * (qlen + kvlen) * qlen
|
||||
+ num_heads * nope_size * kv_lora_rank * qlen
|
||||
+ hidden_size * num_heads * nope_size * qlen
|
||||
)
|
||||
* test_iter
|
||||
/ (total_time * 1e12)
|
||||
)
|
||||
|
||||
|
||||
print('Quant mode:', quant_mode)
|
||||
print('Time(s):', total_time)
|
||||
print('Iteration:', test_iter)
|
||||
print('Time(us) per iteration:', time_per_iter_us)
|
||||
print('Bandwidth:', bandwidth, 'GB/s')
|
||||
print('TFLOPS:', flops)
|
||||
print('')
|
||||
print("Quant mode:", quant_mode)
|
||||
print("Time(s):", total_time)
|
||||
print("Iteration:", test_iter)
|
||||
print("Time(us) per iteration:", time_per_iter_us)
|
||||
print("Bandwidth:", bandwidth, "GB/s")
|
||||
print("TFLOPS:", flops)
|
||||
print("")
|
||||
|
||||
# 整理测试结果
|
||||
result = {
|
||||
@@ -301,7 +323,7 @@ def bench_mla(quant_mode: str):
|
||||
"flops_TFLOPS": flops,
|
||||
"timestamp": time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
|
||||
"test_parameters": {
|
||||
"qlen": qlen,
|
||||
"qlen": qlen,
|
||||
"kvlen": kvlen,
|
||||
"page_table": page_table,
|
||||
"page_size": page_size,
|
||||
@@ -312,21 +334,16 @@ def bench_mla(quant_mode: str):
|
||||
"q_lora_rank": q_lora_rank,
|
||||
"nope_size": nope_size,
|
||||
"rope_size": rope_size,
|
||||
|
||||
|
||||
"layer_num": layer_num,
|
||||
|
||||
"rope_theta": rope_theta,
|
||||
"max_qlen": max_qlen,
|
||||
"max_kvlen": max_kvlen,
|
||||
"max_position_embeddings": max_position_embeddings,
|
||||
|
||||
"rope_scaling": rope_scaling,
|
||||
|
||||
"warm_up_iter": warm_up_iter,
|
||||
"test_iter": test_iter,
|
||||
"CPUInfer_parameter": CPUINFER_PARAM
|
||||
}
|
||||
"CPUInfer_parameter": CPUINFER_PARAM,
|
||||
},
|
||||
}
|
||||
# 添加 git 与系统信息
|
||||
result.update(get_git_commit())
|
||||
@@ -334,6 +351,6 @@ def bench_mla(quant_mode: str):
|
||||
# 将结果记录到 JSON 文件中
|
||||
print(result)
|
||||
record_results(result)
|
||||
|
||||
|
||||
bench_mla("fp16")
|
||||
|
||||
bench_mla("fp16")
|
||||
|
||||
@@ -1,18 +1,19 @@
|
||||
#!/usr/bin/env python
|
||||
# coding=utf-8
|
||||
'''
|
||||
Description :
|
||||
"""
|
||||
Description :
|
||||
Author : chenht2022
|
||||
Date : 2024-07-16 10:43:18
|
||||
Version : 1.0.0
|
||||
LastEditors : chenht2022
|
||||
LastEditors : chenht2022
|
||||
LastEditTime : 2024-08-06 10:36:04
|
||||
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
|
||||
'''
|
||||
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
|
||||
"""
|
||||
import os, sys
|
||||
import time
|
||||
sys.path.append(os.path.dirname(__file__) + '/../build')
|
||||
import kt_kernel_ext
|
||||
|
||||
sys.path.append(os.path.dirname(__file__) + "/../build")
|
||||
from kt_kernel import kt_kernel_ext
|
||||
import torch
|
||||
|
||||
hidden_size = 5120
|
||||
@@ -25,94 +26,108 @@ CPUInfer = kt_kernel_ext.CPUInfer(64)
|
||||
warm_up_iter = 1000
|
||||
test_iter = 10000
|
||||
|
||||
|
||||
def bench_mlp(quant_mode: str):
|
||||
with torch.inference_mode(mode=True):
|
||||
|
||||
hidden_type = 30 # ggml_type::GGML_TYPE_BF16
|
||||
hidden_type = 30 # ggml_type::GGML_TYPE_BF16
|
||||
if quant_mode == "fp32":
|
||||
gate_type = 0 # ggml_type::GGML_TYPE_F32
|
||||
up_type = 0 # ggml_type::GGML_TYPE_F32
|
||||
down_type = 0 # ggml_type::GGML_TYPE_F32
|
||||
gate_type = 0 # ggml_type::GGML_TYPE_F32
|
||||
up_type = 0 # ggml_type::GGML_TYPE_F32
|
||||
down_type = 0 # ggml_type::GGML_TYPE_F32
|
||||
bytes_per_elem = 4.000000
|
||||
elif quant_mode == "fp16":
|
||||
gate_type = 1 # ggml_type::GGML_TYPE_F16
|
||||
up_type = 1 # ggml_type::GGML_TYPE_F16
|
||||
down_type = 1 # ggml_type::GGML_TYPE_F16
|
||||
gate_type = 1 # ggml_type::GGML_TYPE_F16
|
||||
up_type = 1 # ggml_type::GGML_TYPE_F16
|
||||
down_type = 1 # ggml_type::GGML_TYPE_F16
|
||||
bytes_per_elem = 2.000000
|
||||
elif quant_mode == "bf16":
|
||||
gate_type = 30 # ggml_type::GGML_TYPE_BF16
|
||||
up_type = 30 # ggml_type::GGML_TYPE_BF16
|
||||
down_type = 30 # ggml_type::GGML_TYPE_BF16
|
||||
gate_type = 30 # ggml_type::GGML_TYPE_BF16
|
||||
up_type = 30 # ggml_type::GGML_TYPE_BF16
|
||||
down_type = 30 # ggml_type::GGML_TYPE_BF16
|
||||
bytes_per_elem = 2.000000
|
||||
elif quant_mode == "q8_0":
|
||||
gate_type = 8 # ggml_type::GGML_TYPE_Q8_0
|
||||
up_type = 8 # ggml_type::GGML_TYPE_Q8_0
|
||||
down_type = 8 # ggml_type::GGML_TYPE_Q8_0
|
||||
gate_type = 8 # ggml_type::GGML_TYPE_Q8_0
|
||||
up_type = 8 # ggml_type::GGML_TYPE_Q8_0
|
||||
down_type = 8 # ggml_type::GGML_TYPE_Q8_0
|
||||
bytes_per_elem = 1.062500
|
||||
elif quant_mode == "q6_k":
|
||||
gate_type = 14 # ggml_type::GGML_TYPE_Q6_K
|
||||
up_type = 14 # ggml_type::GGML_TYPE_Q6_K
|
||||
down_type = 14 # ggml_type::GGML_TYPE_Q6_K
|
||||
gate_type = 14 # ggml_type::GGML_TYPE_Q6_K
|
||||
up_type = 14 # ggml_type::GGML_TYPE_Q6_K
|
||||
down_type = 14 # ggml_type::GGML_TYPE_Q6_K
|
||||
bytes_per_elem = 0.820312
|
||||
elif quant_mode == "q5_k_m":
|
||||
gate_type = 13 # ggml_type::GGML_TYPE_Q5_K
|
||||
up_type = 13 # ggml_type::GGML_TYPE_Q5_K
|
||||
down_type = 14 # ggml_type::GGML_TYPE_Q6_K
|
||||
gate_type = 13 # ggml_type::GGML_TYPE_Q5_K
|
||||
up_type = 13 # ggml_type::GGML_TYPE_Q5_K
|
||||
down_type = 14 # ggml_type::GGML_TYPE_Q6_K
|
||||
bytes_per_elem = 0.731771
|
||||
elif quant_mode == "q4_k_m":
|
||||
gate_type = 12 # ggml_type::GGML_TYPE_Q4_K
|
||||
up_type = 12 # ggml_type::GGML_TYPE_Q4_K
|
||||
down_type = 14 # ggml_type::GGML_TYPE_Q6_K
|
||||
gate_type = 12 # ggml_type::GGML_TYPE_Q4_K
|
||||
up_type = 12 # ggml_type::GGML_TYPE_Q4_K
|
||||
down_type = 14 # ggml_type::GGML_TYPE_Q6_K
|
||||
bytes_per_elem = 0.648437
|
||||
elif quant_mode == "q3_k_m":
|
||||
gate_type = 11 # ggml_type::GGML_TYPE_Q3_K
|
||||
up_type = 11 # ggml_type::GGML_TYPE_Q3_K
|
||||
down_type = 13 # ggml_type::GGML_TYPE_Q5_K
|
||||
gate_type = 11 # ggml_type::GGML_TYPE_Q3_K
|
||||
up_type = 11 # ggml_type::GGML_TYPE_Q3_K
|
||||
down_type = 13 # ggml_type::GGML_TYPE_Q5_K
|
||||
bytes_per_elem = 0.515625
|
||||
elif quant_mode == "q2_k":
|
||||
gate_type = 10 # ggml_type::GGML_TYPE_Q2_K
|
||||
up_type = 10 # ggml_type::GGML_TYPE_Q2_K
|
||||
down_type = 11 # ggml_type::GGML_TYPE_Q3_K
|
||||
gate_type = 10 # ggml_type::GGML_TYPE_Q2_K
|
||||
up_type = 10 # ggml_type::GGML_TYPE_Q2_K
|
||||
down_type = 11 # ggml_type::GGML_TYPE_Q3_K
|
||||
bytes_per_elem = 0.328125
|
||||
elif quant_mode == "iq3_xs":
|
||||
gate_type = 21 # ggml_type::GGML_TYPE_IQ3_S
|
||||
up_type = 21 # ggml_type::GGML_TYPE_IQ3_S
|
||||
down_type = 21 # ggml_type::GGML_TYPE_IQ3_S
|
||||
gate_type = 21 # ggml_type::GGML_TYPE_IQ3_S
|
||||
up_type = 21 # ggml_type::GGML_TYPE_IQ3_S
|
||||
down_type = 21 # ggml_type::GGML_TYPE_IQ3_S
|
||||
bytes_per_elem = 0.429688
|
||||
elif quant_mode == "iq2_xxs":
|
||||
gate_type = 16 # ggml_type::GGML_TYPE_IQ2_XXS
|
||||
up_type = 16 # ggml_type::GGML_TYPE_IQ2_XXS
|
||||
down_type = 16 # ggml_type::GGML_TYPE_IQ2_XXS
|
||||
gate_type = 16 # ggml_type::GGML_TYPE_IQ2_XXS
|
||||
up_type = 16 # ggml_type::GGML_TYPE_IQ2_XXS
|
||||
down_type = 16 # ggml_type::GGML_TYPE_IQ2_XXS
|
||||
bytes_per_elem = 0.257812
|
||||
else:
|
||||
assert(False)
|
||||
|
||||
assert False
|
||||
|
||||
mlps = []
|
||||
gate_projs = []
|
||||
up_projs = []
|
||||
down_projs = []
|
||||
for _ in range(layer_num):
|
||||
gate_proj = torch.randn((intermediate_size, hidden_size), dtype=torch.float32, device = "cuda").to("cpu").contiguous()
|
||||
up_proj = torch.randn((intermediate_size, hidden_size), dtype=torch.float32, device = "cuda").to("cpu").contiguous()
|
||||
down_proj = torch.randn((hidden_size, intermediate_size), dtype=torch.float32, device = "cuda").to("cpu").contiguous()
|
||||
config = kt_kernel_ext.mlp.MLPConfig(hidden_size, intermediate_size, stride, group_max_len, gate_proj.data_ptr(), up_proj.data_ptr(), down_proj.data_ptr(), gate_type, up_type, down_type, hidden_type)
|
||||
gate_proj = (
|
||||
torch.randn((intermediate_size, hidden_size), dtype=torch.float32, device="cuda").to("cpu").contiguous()
|
||||
)
|
||||
up_proj = (
|
||||
torch.randn((intermediate_size, hidden_size), dtype=torch.float32, device="cuda").to("cpu").contiguous()
|
||||
)
|
||||
down_proj = (
|
||||
torch.randn((hidden_size, intermediate_size), dtype=torch.float32, device="cuda").to("cpu").contiguous()
|
||||
)
|
||||
config = kt_kernel_ext.mlp.MLPConfig(
|
||||
hidden_size,
|
||||
intermediate_size,
|
||||
stride,
|
||||
group_max_len,
|
||||
gate_proj.data_ptr(),
|
||||
up_proj.data_ptr(),
|
||||
down_proj.data_ptr(),
|
||||
gate_type,
|
||||
up_type,
|
||||
down_type,
|
||||
hidden_type,
|
||||
)
|
||||
mlp = kt_kernel_ext.mlp.MLP(config)
|
||||
gate_projs.append(gate_proj)
|
||||
up_projs.append(up_proj)
|
||||
down_projs.append(down_proj)
|
||||
mlps.append(mlp)
|
||||
input = torch.randn((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device = "cuda").to("cpu").contiguous()
|
||||
output = torch.empty((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device = "cuda").to("cpu").contiguous()
|
||||
input = torch.randn((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device="cuda").to("cpu").contiguous()
|
||||
output = torch.empty((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device="cuda").to("cpu").contiguous()
|
||||
|
||||
# warm up
|
||||
for i in range(warm_up_iter):
|
||||
CPUInfer.submit(
|
||||
mlps[i % layer_num].forward(
|
||||
qlen,
|
||||
input[i % layer_num].data_ptr(),
|
||||
output[i % layer_num].data_ptr()
|
||||
)
|
||||
mlps[i % layer_num].forward(qlen, input[i % layer_num].data_ptr(), output[i % layer_num].data_ptr())
|
||||
)
|
||||
CPUInfer.sync()
|
||||
|
||||
@@ -120,21 +135,22 @@ def bench_mlp(quant_mode: str):
|
||||
start = time.perf_counter()
|
||||
for i in range(test_iter):
|
||||
CPUInfer.submit(
|
||||
mlps[i % layer_num].forward(
|
||||
qlen,
|
||||
input[i % layer_num].data_ptr(),
|
||||
output[i % layer_num].data_ptr()
|
||||
)
|
||||
mlps[i % layer_num].forward(qlen, input[i % layer_num].data_ptr(), output[i % layer_num].data_ptr())
|
||||
)
|
||||
CPUInfer.sync()
|
||||
end = time.perf_counter()
|
||||
total_time = end - start
|
||||
print('Quant mode: ', quant_mode)
|
||||
print('Time(s): ', total_time)
|
||||
print('Iteration: ', test_iter)
|
||||
print('Time(us) per iteration: ', total_time / test_iter * 1000000)
|
||||
print('Bandwidth: ', hidden_size * intermediate_size * 3 * bytes_per_elem * test_iter / total_time / 1000 / 1000 / 1000, 'GB/s')
|
||||
print('')
|
||||
print("Quant mode: ", quant_mode)
|
||||
print("Time(s): ", total_time)
|
||||
print("Iteration: ", test_iter)
|
||||
print("Time(us) per iteration: ", total_time / test_iter * 1000000)
|
||||
print(
|
||||
"Bandwidth: ",
|
||||
hidden_size * intermediate_size * 3 * bytes_per_elem * test_iter / total_time / 1000 / 1000 / 1000,
|
||||
"GB/s",
|
||||
)
|
||||
print("")
|
||||
|
||||
|
||||
bench_mlp("fp32")
|
||||
bench_mlp("fp16")
|
||||
|
||||
@@ -5,8 +5,8 @@ import json
|
||||
import subprocess
|
||||
import platform
|
||||
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'build'))
|
||||
import kt_kernel_ext
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "build"))
|
||||
from kt_kernel import kt_kernel_ext
|
||||
import torch
|
||||
from tqdm import tqdm
|
||||
|
||||
@@ -35,7 +35,7 @@ CPUInfer = kt_kernel_ext.CPUInfer(CPUINFER_PARAM)
|
||||
script_path = os.path.abspath(__file__)
|
||||
script_dir = os.path.dirname(script_path)
|
||||
script_name = os.path.splitext(os.path.basename(script_path))[0]
|
||||
json_path = os.path.join(script_dir, "bench_results "+ ".jsonl")
|
||||
json_path = os.path.join(script_dir, "bench_results " + ".jsonl")
|
||||
|
||||
|
||||
def get_git_commit():
|
||||
@@ -76,9 +76,9 @@ def get_system_info():
|
||||
|
||||
# 获取 CPU 型号(仅 Linux 支持)
|
||||
cpu_model = None
|
||||
if os.path.exists('/proc/cpuinfo'):
|
||||
if os.path.exists("/proc/cpuinfo"):
|
||||
try:
|
||||
with open('/proc/cpuinfo', 'r') as f:
|
||||
with open("/proc/cpuinfo", "r") as f:
|
||||
for line in f:
|
||||
if "model name" in line:
|
||||
cpu_model = line.split(":", 1)[1].strip()
|
||||
@@ -89,9 +89,9 @@ def get_system_info():
|
||||
|
||||
# 获取内存大小(单位:GB),仅 Linux 支持
|
||||
mem_total_gb = None
|
||||
if os.path.exists('/proc/meminfo'):
|
||||
if os.path.exists("/proc/meminfo"):
|
||||
try:
|
||||
with open('/proc/meminfo', 'r') as f:
|
||||
with open("/proc/meminfo", "r") as f:
|
||||
for line in f:
|
||||
if "MemTotal" in line:
|
||||
mem_kb = float(line.split(":", 1)[1].split()[0])
|
||||
@@ -134,57 +134,57 @@ def bench_moe(quant_mode: str):
|
||||
# 根据量化模式设置数据类型与 bytes_per_elem
|
||||
hidden_type = 30 # ggml_type::GGML_TYPE_BF16(固定)
|
||||
if quant_mode == "fp32":
|
||||
gate_type = 0 # ggml_type::GGML_TYPE_F32
|
||||
gate_type = 0 # ggml_type::GGML_TYPE_F32
|
||||
up_type = 0
|
||||
down_type = 0
|
||||
bytes_per_elem = 4.0
|
||||
elif quant_mode == "fp16":
|
||||
gate_type = 1 # ggml_type::GGML_TYPE_F16
|
||||
gate_type = 1 # ggml_type::GGML_TYPE_F16
|
||||
up_type = 1
|
||||
down_type = 1
|
||||
bytes_per_elem = 2.0
|
||||
elif quant_mode == "bf16":
|
||||
gate_type = 30 # ggml_type::GGML_TYPE_BF16
|
||||
gate_type = 30 # ggml_type::GGML_TYPE_BF16
|
||||
up_type = 30
|
||||
down_type = 30
|
||||
bytes_per_elem = 2.0
|
||||
elif quant_mode == "q8_0":
|
||||
gate_type = 8 # ggml_type::GGML_TYPE_Q8_0
|
||||
gate_type = 8 # ggml_type::GGML_TYPE_Q8_0
|
||||
up_type = 8
|
||||
down_type = 8
|
||||
bytes_per_elem = 1.062500
|
||||
elif quant_mode == "q6_k":
|
||||
gate_type = 14 # ggml_type::GGML_TYPE_Q6_K
|
||||
gate_type = 14 # ggml_type::GGML_TYPE_Q6_K
|
||||
up_type = 14
|
||||
down_type = 14
|
||||
bytes_per_elem = 0.820312
|
||||
elif quant_mode == "q5_k_m":
|
||||
gate_type = 13 # ggml_type::GGML_TYPE_Q5_K
|
||||
gate_type = 13 # ggml_type::GGML_TYPE_Q5_K
|
||||
up_type = 13
|
||||
down_type = 14 # ggml_type::GGML_TYPE_Q6_K
|
||||
down_type = 14 # ggml_type::GGML_TYPE_Q6_K
|
||||
bytes_per_elem = 0.731771
|
||||
elif quant_mode == "q4_k_m":
|
||||
gate_type = 12 # ggml_type::GGML_TYPE_Q4_K
|
||||
gate_type = 12 # ggml_type::GGML_TYPE_Q4_K
|
||||
up_type = 12
|
||||
down_type = 14 # ggml_type::GGML_TYPE_Q6_K
|
||||
down_type = 14 # ggml_type::GGML_TYPE_Q6_K
|
||||
bytes_per_elem = 0.648437
|
||||
elif quant_mode == "q3_k_m":
|
||||
gate_type = 11 # ggml_type::GGML_TYPE_Q3_K
|
||||
gate_type = 11 # ggml_type::GGML_TYPE_Q3_K
|
||||
up_type = 11
|
||||
down_type = 13 # ggml_type::GGML_TYPE_Q5_K
|
||||
down_type = 13 # ggml_type::GGML_TYPE_Q5_K
|
||||
bytes_per_elem = 0.515625
|
||||
elif quant_mode == "q2_k":
|
||||
gate_type = 10 # ggml_type::GGML_TYPE_Q2_K
|
||||
gate_type = 10 # ggml_type::GGML_TYPE_Q2_K
|
||||
up_type = 10
|
||||
down_type = 11 # ggml_type::GGML_TYPE_Q3_K
|
||||
down_type = 11 # ggml_type::GGML_TYPE_Q3_K
|
||||
bytes_per_elem = 0.328125
|
||||
elif quant_mode == "iq3_xs":
|
||||
gate_type = 21 # ggml_type::GGML_TYPE_IQ3_S
|
||||
gate_type = 21 # ggml_type::GGML_TYPE_IQ3_S
|
||||
up_type = 21
|
||||
down_type = 21
|
||||
bytes_per_elem = 0.429688
|
||||
elif quant_mode == "iq2_xxs":
|
||||
gate_type = 16 # ggml_type::GGML_TYPE_IQ2_XXS
|
||||
gate_type = 16 # ggml_type::GGML_TYPE_IQ2_XXS
|
||||
up_type = 16
|
||||
down_type = 16
|
||||
bytes_per_elem = 0.257812
|
||||
@@ -194,13 +194,25 @@ def bench_moe(quant_mode: str):
|
||||
# 构建各层 MoE 模型
|
||||
moes = []
|
||||
for _ in tqdm(range(layer_num), desc="Initializing MOEs"):
|
||||
gate_proj = torch.randn((expert_num, intermediate_size, hidden_size), dtype=torch.float16, device="cpu").to("cpu").contiguous()
|
||||
up_proj = torch.randn((expert_num, intermediate_size, hidden_size), dtype=torch.float16, device="cpu").to("cpu").contiguous()
|
||||
down_proj = torch.randn((expert_num, hidden_size, intermediate_size), dtype=torch.float16, device="cpu").to("cpu").contiguous()
|
||||
|
||||
gate_proj = (
|
||||
torch.randn((expert_num, intermediate_size, hidden_size), dtype=torch.float16, device="cpu")
|
||||
.to("cpu")
|
||||
.contiguous()
|
||||
)
|
||||
up_proj = (
|
||||
torch.randn((expert_num, intermediate_size, hidden_size), dtype=torch.float16, device="cpu")
|
||||
.to("cpu")
|
||||
.contiguous()
|
||||
)
|
||||
down_proj = (
|
||||
torch.randn((expert_num, hidden_size, intermediate_size), dtype=torch.float16, device="cpu")
|
||||
.to("cpu")
|
||||
.contiguous()
|
||||
)
|
||||
|
||||
config = kt_kernel_ext.moe.MOEConfig(expert_num, num_experts_per_tok, hidden_size, intermediate_size)
|
||||
config.pool = CPUInfer.backend_
|
||||
config.m_block = m_block
|
||||
config.m_block = m_block
|
||||
config.group_min_len = group_min_len
|
||||
config.group_max_len = group_max_len
|
||||
config.gate_proj = gate_proj.data_ptr()
|
||||
@@ -215,47 +227,52 @@ def bench_moe(quant_mode: str):
|
||||
CPUInfer.submit(moe.load_weights_task())
|
||||
CPUInfer.sync()
|
||||
moes.append(moe)
|
||||
|
||||
|
||||
# 生成输入数据
|
||||
print('Generating data...')
|
||||
print("Generating data...")
|
||||
# 专家路由索引与权重,每层一个
|
||||
gen_iter = 1000
|
||||
expert_ids = torch.rand(gen_iter * qlen , expert_num, device="cpu").argsort(dim=-1)[:, :num_experts_per_tok].reshape(gen_iter, qlen * num_experts_per_tok).contiguous()
|
||||
weights = torch.rand((gen_iter, qlen, num_experts_per_tok), dtype=torch.float32, device="cpu").contiguous()
|
||||
expert_ids = (
|
||||
torch.rand(gen_iter * qlen, expert_num, device="cpu")
|
||||
.argsort(dim=-1)[:, :num_experts_per_tok]
|
||||
.reshape(gen_iter, qlen * num_experts_per_tok)
|
||||
.contiguous()
|
||||
)
|
||||
weights = torch.rand((gen_iter, qlen, num_experts_per_tok), dtype=torch.float32, device="cpu").contiguous()
|
||||
input_tensor = torch.randn((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device="cpu").contiguous()
|
||||
output_tensor = torch.empty((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device="cpu").contiguous()
|
||||
# 将 qlen 封装成 tensor,用于 forward 调用
|
||||
qlen_tensor = torch.tensor([qlen], dtype=torch.int32)
|
||||
|
||||
# 预热阶段
|
||||
print('Warming up...')
|
||||
print("Warming up...")
|
||||
for i in tqdm(range(warm_up_iter), desc="Warm-up"):
|
||||
CPUInfer.submit(
|
||||
moes[i % layer_num].forward_task(
|
||||
qlen_tensor.data_ptr(),
|
||||
num_experts_per_tok,
|
||||
expert_ids[i%gen_iter].data_ptr(),
|
||||
weights[i%gen_iter].data_ptr(),
|
||||
expert_ids[i % gen_iter].data_ptr(),
|
||||
weights[i % gen_iter].data_ptr(),
|
||||
input_tensor[i % layer_num].data_ptr(),
|
||||
output_tensor[i % layer_num].data_ptr(),
|
||||
False
|
||||
False,
|
||||
)
|
||||
)
|
||||
CPUInfer.sync()
|
||||
|
||||
# 测试阶段
|
||||
print('Start testing...')
|
||||
print("Start testing...")
|
||||
start = time.perf_counter()
|
||||
for i in tqdm(range(test_iter), desc="Testing"):
|
||||
CPUInfer.submit(
|
||||
moes[i % layer_num].forward_task(
|
||||
qlen_tensor.data_ptr(),
|
||||
num_experts_per_tok,
|
||||
expert_ids[i%gen_iter].data_ptr(),
|
||||
weights[i%gen_iter].data_ptr(),
|
||||
expert_ids[i % gen_iter].data_ptr(),
|
||||
weights[i % gen_iter].data_ptr(),
|
||||
input_tensor[i % layer_num].data_ptr(),
|
||||
output_tensor[i % layer_num].data_ptr(),
|
||||
False
|
||||
False,
|
||||
)
|
||||
)
|
||||
CPUInfer.sync()
|
||||
@@ -264,17 +281,29 @@ def bench_moe(quant_mode: str):
|
||||
|
||||
# 计算性能指标
|
||||
time_per_iter_us = total_time / test_iter * 1e6
|
||||
bandwidth = hidden_size * intermediate_size * 3 * num_experts_per_tok * (1/8 * 256 * (1-(31/32)**qlen)) * bytes_per_elem * test_iter / total_time / 1e9 # 单位:GB/s
|
||||
flops = hidden_size * intermediate_size * qlen * 3 * num_experts_per_tok * 2 * test_iter / total_time / 1e12 # 单位:TFLOPS
|
||||
bandwidth = (
|
||||
hidden_size
|
||||
* intermediate_size
|
||||
* 3
|
||||
* num_experts_per_tok
|
||||
* (1 / 8 * 256 * (1 - (31 / 32) ** qlen))
|
||||
* bytes_per_elem
|
||||
* test_iter
|
||||
/ total_time
|
||||
/ 1e9
|
||||
) # 单位:GB/s
|
||||
flops = (
|
||||
hidden_size * intermediate_size * qlen * 3 * num_experts_per_tok * 2 * test_iter / total_time / 1e12
|
||||
) # 单位:TFLOPS
|
||||
|
||||
# 打印结果
|
||||
print('Quant mode:', quant_mode)
|
||||
print('Time(s):', total_time)
|
||||
print('Iteration:', test_iter)
|
||||
print('Time(us) per iteration:', time_per_iter_us)
|
||||
print('Bandwidth:', bandwidth, 'GB/s')
|
||||
print('TFLOPS:', flops)
|
||||
print('')
|
||||
print("Quant mode:", quant_mode)
|
||||
print("Time(s):", total_time)
|
||||
print("Iteration:", test_iter)
|
||||
print("Time(us) per iteration:", time_per_iter_us)
|
||||
print("Bandwidth:", bandwidth, "GB/s")
|
||||
print("TFLOPS:", flops)
|
||||
print("")
|
||||
|
||||
# 整理测试结果
|
||||
result = {
|
||||
@@ -298,8 +327,8 @@ def bench_moe(quant_mode: str):
|
||||
"qlen": qlen,
|
||||
"warm_up_iter": warm_up_iter,
|
||||
"test_iter": test_iter,
|
||||
"CPUInfer_parameter": CPUINFER_PARAM
|
||||
}
|
||||
"CPUInfer_parameter": CPUINFER_PARAM,
|
||||
},
|
||||
}
|
||||
# 添加 git 与系统信息
|
||||
result.update(get_git_commit())
|
||||
@@ -321,4 +350,4 @@ if __name__ == "__main__":
|
||||
# bench_moe("q3_k_m", layer_num)
|
||||
# bench_moe("q2_k", layer_num)
|
||||
# bench_moe("iq3_xs", layer_num)
|
||||
# bench_moe("iq2_xxs", layer_num)
|
||||
# bench_moe("iq2_xxs", layer_num)
|
||||
|
||||
@@ -15,7 +15,7 @@ from tqdm import tqdm
|
||||
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "build"))
|
||||
import torch
|
||||
import kt_kernel_ext
|
||||
from kt_kernel import kt_kernel_ext
|
||||
import numpy as np
|
||||
|
||||
# 测试参数设置
|
||||
|
||||
@@ -1,19 +1,20 @@
|
||||
#!/usr/bin/env python
|
||||
# coding=utf-8
|
||||
'''
|
||||
Description :
|
||||
"""
|
||||
Description :
|
||||
Author : chenht2022
|
||||
Date : 2024-07-25 10:32:05
|
||||
Version : 1.0.0
|
||||
LastEditors : chenht2022
|
||||
LastEditors : chenht2022
|
||||
LastEditTime : 2024-08-06 10:41:28
|
||||
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
|
||||
'''
|
||||
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
|
||||
"""
|
||||
import os, sys, time, json, subprocess, platform
|
||||
|
||||
from tqdm import tqdm
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'build'))
|
||||
import kt_kernel_ext
|
||||
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "build"))
|
||||
from kt_kernel import kt_kernel_ext
|
||||
import torch
|
||||
import numpy as np
|
||||
|
||||
@@ -21,33 +22,28 @@ import numpy as np
|
||||
expert_num = 256
|
||||
hidden_size = 7168
|
||||
intermediate_size = 2048
|
||||
max_len = 25600
|
||||
max_len = 25600
|
||||
num_experts_per_tok = 8
|
||||
layer_num = 4
|
||||
qlen = 1024
|
||||
# qlen = 1
|
||||
# qlen = 1
|
||||
warm_up_iter = 1000
|
||||
test_iter = 5000
|
||||
k_group_size = 128
|
||||
|
||||
physical_to_logical_map = torch.tensor(
|
||||
data=range(expert_num),
|
||||
device="cpu",
|
||||
dtype=torch.int64
|
||||
).contiguous()
|
||||
physical_to_logical_map = torch.tensor(data=range(expert_num), device="cpu", dtype=torch.int64).contiguous()
|
||||
# 将 CPUInfer 参数设为变量
|
||||
# CPUINFER_PARAM = 257
|
||||
# CPUInfer = kt_kernel_ext.CPUInfer(CPUINFER_PARAM)
|
||||
|
||||
worker_config = kt_kernel_ext.WorkerPoolConfig()
|
||||
worker_config.subpool_count = 2
|
||||
worker_config.subpool_numa_map= [0,1]
|
||||
worker_config.subpool_thread_count = [40,40]
|
||||
worker_config.subpool_numa_map = [0, 1]
|
||||
worker_config.subpool_thread_count = [40, 40]
|
||||
CPUINFER_PARAM = 80
|
||||
CPUInfer = kt_kernel_ext.CPUInfer(worker_config)
|
||||
|
||||
|
||||
|
||||
def get_git_commit():
|
||||
"""
|
||||
获取当前 git 提交记录(commit hash 和提交信息),
|
||||
@@ -82,14 +78,14 @@ def get_system_info():
|
||||
info = {}
|
||||
# 系统名称及主机名
|
||||
uname = platform.uname()
|
||||
info["system_name"] = uname.system # 如 Linux, Windows 等
|
||||
info["node_name"] = uname.node # 主机名称
|
||||
info["system_name"] = uname.system # 如 Linux, Windows 等
|
||||
info["node_name"] = uname.node # 主机名称
|
||||
|
||||
# 获取 CPU 型号(仅 Linux 支持)
|
||||
cpu_model = None
|
||||
if os.path.exists('/proc/cpuinfo'):
|
||||
if os.path.exists("/proc/cpuinfo"):
|
||||
try:
|
||||
with open('/proc/cpuinfo', 'r') as f:
|
||||
with open("/proc/cpuinfo", "r") as f:
|
||||
for line in f:
|
||||
if "model name" in line:
|
||||
cpu_model = line.split(":", 1)[1].strip()
|
||||
@@ -100,9 +96,9 @@ def get_system_info():
|
||||
|
||||
# 获取内存大小(单位:GB),仅 Linux 支持
|
||||
mem_total_gb = None
|
||||
if os.path.exists('/proc/meminfo'):
|
||||
if os.path.exists("/proc/meminfo"):
|
||||
try:
|
||||
with open('/proc/meminfo', 'r') as f:
|
||||
with open("/proc/meminfo", "r") as f:
|
||||
for line in f:
|
||||
if "MemTotal" in line:
|
||||
mem_kb = float(line.split(":", 1)[1].split()[0])
|
||||
@@ -130,11 +126,13 @@ def get_system_info():
|
||||
|
||||
return info
|
||||
|
||||
|
||||
script_path = os.path.abspath(__file__)
|
||||
script_dir = os.path.dirname(script_path)
|
||||
script_name = os.path.splitext(os.path.basename(script_path))[0]
|
||||
json_path = os.path.join(script_dir, script_name + ".jsonl")
|
||||
|
||||
|
||||
def record_results(result, filename=json_path):
|
||||
"""
|
||||
将结果以 JSON 格式追加到文件中
|
||||
@@ -142,6 +140,7 @@ def record_results(result, filename=json_path):
|
||||
with open(filename, "a") as f:
|
||||
f.write(json.dumps(result) + "\n")
|
||||
|
||||
|
||||
def bench_moe(quant_mode: str):
|
||||
with torch.inference_mode():
|
||||
if quant_mode == "bf16":
|
||||
@@ -160,11 +159,22 @@ def bench_moe(quant_mode: str):
|
||||
up_projs = []
|
||||
down_projs = []
|
||||
for layer_index in range(layer_num):
|
||||
gate_proj = torch.randn((expert_num, intermediate_size, hidden_size), dtype=torch.float32, device="cuda").to("cpu").contiguous()
|
||||
up_proj = torch.randn((expert_num, intermediate_size, hidden_size), dtype=torch.float32, device="cuda").to("cpu").contiguous()
|
||||
down_proj = torch.randn((expert_num, hidden_size, intermediate_size), dtype=torch.float32, device="cuda").to("cpu").contiguous()
|
||||
config = kt_kernel_ext.moe.MOEConfig(
|
||||
expert_num, num_experts_per_tok, hidden_size, intermediate_size,0)
|
||||
gate_proj = (
|
||||
torch.randn((expert_num, intermediate_size, hidden_size), dtype=torch.float32, device="cuda")
|
||||
.to("cpu")
|
||||
.contiguous()
|
||||
)
|
||||
up_proj = (
|
||||
torch.randn((expert_num, intermediate_size, hidden_size), dtype=torch.float32, device="cuda")
|
||||
.to("cpu")
|
||||
.contiguous()
|
||||
)
|
||||
down_proj = (
|
||||
torch.randn((expert_num, hidden_size, intermediate_size), dtype=torch.float32, device="cuda")
|
||||
.to("cpu")
|
||||
.contiguous()
|
||||
)
|
||||
config = kt_kernel_ext.moe.MOEConfig(expert_num, num_experts_per_tok, hidden_size, intermediate_size, 0)
|
||||
config.max_len = max_len
|
||||
config.gate_proj = gate_proj.data_ptr()
|
||||
config.up_proj = up_proj.data_ptr()
|
||||
@@ -189,10 +199,22 @@ def bench_moe(quant_mode: str):
|
||||
down_projs.append(down_proj)
|
||||
moes.append(moe)
|
||||
gen_iter = 3000
|
||||
expert_ids = torch.rand(gen_iter * qlen , expert_num, device="cpu").argsort(dim=-1)[:, :num_experts_per_tok].reshape(gen_iter, qlen * num_experts_per_tok).to("cpu").contiguous()
|
||||
weights = torch.rand((gen_iter, qlen, num_experts_per_tok), dtype=torch.float32, device="cpu").to("cpu").contiguous()
|
||||
input_tensor = torch.randn((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device="cuda").to("cpu").contiguous()
|
||||
output_tensor = torch.empty((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device="cuda").to("cpu").contiguous()
|
||||
expert_ids = (
|
||||
torch.rand(gen_iter * qlen, expert_num, device="cpu")
|
||||
.argsort(dim=-1)[:, :num_experts_per_tok]
|
||||
.reshape(gen_iter, qlen * num_experts_per_tok)
|
||||
.to("cpu")
|
||||
.contiguous()
|
||||
)
|
||||
weights = (
|
||||
torch.rand((gen_iter, qlen, num_experts_per_tok), dtype=torch.float32, device="cpu").to("cpu").contiguous()
|
||||
)
|
||||
input_tensor = (
|
||||
torch.randn((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device="cuda").to("cpu").contiguous()
|
||||
)
|
||||
output_tensor = (
|
||||
torch.empty((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device="cuda").to("cpu").contiguous()
|
||||
)
|
||||
bsz_tensor = torch.tensor([qlen], device="cpu")
|
||||
|
||||
# 预热迭代
|
||||
@@ -203,8 +225,8 @@ def bench_moe(quant_mode: str):
|
||||
moes[i % layer_num].forward_task(
|
||||
bsz_tensor.data_ptr(),
|
||||
num_experts_per_tok,
|
||||
expert_ids[i%gen_iter].data_ptr(),
|
||||
weights[i%gen_iter].data_ptr(),
|
||||
expert_ids[i % gen_iter].data_ptr(),
|
||||
weights[i % gen_iter].data_ptr(),
|
||||
input_tensor[i % layer_num].data_ptr(),
|
||||
output_tensor[i % layer_num].data_ptr(),
|
||||
False,
|
||||
@@ -224,8 +246,8 @@ def bench_moe(quant_mode: str):
|
||||
moes[i % layer_num].forward_task(
|
||||
bsz_tensor.data_ptr(),
|
||||
num_experts_per_tok,
|
||||
expert_ids[i%gen_iter].data_ptr(),
|
||||
weights[i%gen_iter].data_ptr(),
|
||||
expert_ids[i % gen_iter].data_ptr(),
|
||||
weights[i % gen_iter].data_ptr(),
|
||||
input_tensor[i % layer_num].data_ptr(),
|
||||
output_tensor[i % layer_num].data_ptr(),
|
||||
False,
|
||||
@@ -239,16 +261,28 @@ def bench_moe(quant_mode: str):
|
||||
|
||||
# 计算性能指标
|
||||
time_per_iter_us = total_time / test_iter * 1e6
|
||||
bandwidth = hidden_size * intermediate_size * 3 * num_experts_per_tok * (1/8 * 256 * (1-(31/32)**qlen)) * bytes_per_elem * test_iter / total_time / 1e9 # 单位:GB/s
|
||||
flops = hidden_size * intermediate_size * qlen * 3 * num_experts_per_tok * 2 * test_iter / total_time / 1e12 # 单位:TFLOPS
|
||||
bandwidth = (
|
||||
hidden_size
|
||||
* intermediate_size
|
||||
* 3
|
||||
* num_experts_per_tok
|
||||
* (1 / 8 * 256 * (1 - (31 / 32) ** qlen))
|
||||
* bytes_per_elem
|
||||
* test_iter
|
||||
/ total_time
|
||||
/ 1e9
|
||||
) # 单位:GB/s
|
||||
flops = (
|
||||
hidden_size * intermediate_size * qlen * 3 * num_experts_per_tok * 2 * test_iter / total_time / 1e12
|
||||
) # 单位:TFLOPS
|
||||
|
||||
print('Quant mode: ', quant_mode)
|
||||
print('Time(s): ', total_time)
|
||||
print('Iteration: ', test_iter)
|
||||
print('Time(us) per iteration: ', time_per_iter_us)
|
||||
print('Bandwidth: ', bandwidth, 'GB/s')
|
||||
print('Flops: ', flops, 'TFLOPS')
|
||||
print('')
|
||||
print("Quant mode: ", quant_mode)
|
||||
print("Time(s): ", total_time)
|
||||
print("Iteration: ", test_iter)
|
||||
print("Time(us) per iteration: ", time_per_iter_us)
|
||||
print("Bandwidth: ", bandwidth, "GB/s")
|
||||
print("Flops: ", flops, "TFLOPS")
|
||||
print("")
|
||||
|
||||
# 整理结果记录,包括测试参数
|
||||
result = {
|
||||
@@ -270,8 +304,8 @@ def bench_moe(quant_mode: str):
|
||||
"warm_up_iter": warm_up_iter,
|
||||
"test_iter": test_iter,
|
||||
"CPUInfer_parameter": CPUINFER_PARAM,
|
||||
"k_group_size": k_group_size
|
||||
}
|
||||
"k_group_size": k_group_size,
|
||||
},
|
||||
}
|
||||
# 添加 git 提交记录信息
|
||||
result.update(get_git_commit())
|
||||
@@ -280,9 +314,10 @@ def bench_moe(quant_mode: str):
|
||||
# 将结果以 JSON 形式追加到文件中
|
||||
record_results(result)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
# 选择需要测试的量化模式
|
||||
# bench_moe("bf16")
|
||||
# bench_moe("int8")
|
||||
# bench_moe("int4")
|
||||
bench_moe("int4_1k")
|
||||
bench_moe("int4_1k")
|
||||
|
||||
@@ -14,7 +14,7 @@ import os, sys, time, json, subprocess, platform
|
||||
os.environ["BLAS_NUM_THREADS"] = "1"
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "build"))
|
||||
import torch
|
||||
import kt_kernel_ext
|
||||
from kt_kernel import kt_kernel_ext
|
||||
import numpy as np
|
||||
from tqdm import tqdm
|
||||
|
||||
|
||||
@@ -26,7 +26,7 @@ os.environ.setdefault("BLAS_NUM_THREADS", "1")
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "build"))
|
||||
|
||||
import torch # noqa: E402
|
||||
import kt_kernel_ext as ce # noqa: E402
|
||||
from kt_kernel import kt_kernel_ext as ce # noqa: E402
|
||||
from tqdm import tqdm # noqa: E402
|
||||
|
||||
|
||||
|
||||
@@ -13,7 +13,7 @@ import os, sys, time, json, subprocess, platform
|
||||
|
||||
os.environ["BLAS_NUM_THREADS"] = "1"
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "build"))
|
||||
import kt_kernel_ext
|
||||
from kt_kernel import kt_kernel_ext
|
||||
import torch
|
||||
import numpy as np
|
||||
from tqdm import tqdm
|
||||
|
||||
@@ -1,9 +1,10 @@
|
||||
import os
|
||||
import sys
|
||||
sys.path.insert(0, os.path.dirname(__file__) + '/../build')
|
||||
|
||||
sys.path.insert(0, os.path.dirname(__file__) + "/../build")
|
||||
import torch
|
||||
import ctypes
|
||||
import kt_kernel_ext
|
||||
from kt_kernel import kt_kernel_ext
|
||||
from kt_kernel_ext.moe import MOEConfig, MOE, AMXBF16_MOE, AMXInt8_MOE, AMXInt4_MOE, AMXInt4_1_MOE
|
||||
|
||||
intermediate_size_full = 2048
|
||||
@@ -14,20 +15,14 @@ num_experts_per_tok = 8
|
||||
cpu_infer = kt_kernel_ext.CPUInfer(97)
|
||||
|
||||
up = torch.empty(experts_num, intermediate_size_full, hidden_size, dtype=torch.bfloat16, device="cpu")
|
||||
|
||||
|
||||
gate = torch.empty(experts_num, intermediate_size_full, hidden_size, dtype=torch.bfloat16, device="cpu")
|
||||
|
||||
|
||||
down = torch.empty(experts_num, hidden_size, intermediate_size_full, dtype=torch.bfloat16, device="cpu")
|
||||
|
||||
gate_ptr = ctypes.addressof(
|
||||
ctypes.cast(gate.data_ptr(), ctypes.POINTER(ctypes.c_uint64)).contents
|
||||
)
|
||||
up_ptr = ctypes.addressof(
|
||||
ctypes.cast(up.data_ptr(), ctypes.POINTER(ctypes.c_uint64)).contents
|
||||
)
|
||||
down_ptr = ctypes.addressof(
|
||||
ctypes.cast(down.data_ptr(), ctypes.POINTER(ctypes.c_uint64)).contents
|
||||
)
|
||||
gate_ptr = ctypes.addressof(ctypes.cast(gate.data_ptr(), ctypes.POINTER(ctypes.c_uint64)).contents)
|
||||
up_ptr = ctypes.addressof(ctypes.cast(up.data_ptr(), ctypes.POINTER(ctypes.c_uint64)).contents)
|
||||
down_ptr = ctypes.addressof(ctypes.cast(down.data_ptr(), ctypes.POINTER(ctypes.c_uint64)).contents)
|
||||
moe_config = MOEConfig(
|
||||
experts_num,
|
||||
num_experts_per_tok,
|
||||
@@ -36,9 +31,9 @@ moe_config = MOEConfig(
|
||||
)
|
||||
moe_config.layer_idx = 45
|
||||
moe_config.pool = cpu_infer.backend_
|
||||
moe_config.max_len = 1024 #TODO(zbx): multi cuda graph
|
||||
moe_config.max_len = 1024 # TODO(zbx): multi cuda graph
|
||||
moe_config.gate_proj = gate_ptr
|
||||
moe_config.up_proj = up_ptr
|
||||
moe_config.down_proj = down_ptr
|
||||
moe_config.path = ""
|
||||
moe = AMXInt4_MOE(moe_config)
|
||||
moe = AMXInt4_MOE(moe_config)
|
||||
|
||||
@@ -1,19 +1,19 @@
|
||||
#!/usr/bin/env python
|
||||
# coding=utf-8
|
||||
"""
|
||||
Description :
|
||||
Description :
|
||||
Author : Jianwei Dong
|
||||
Date : 2024-08-28 10:32:05
|
||||
Version : 1.0.0
|
||||
LastEditors : chenht2022
|
||||
LastEditors : chenht2022
|
||||
LastEditTime : 2024-08-28 10:32:05
|
||||
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
|
||||
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
|
||||
"""
|
||||
import os, sys
|
||||
import time
|
||||
|
||||
sys.path.append(os.path.dirname(__file__) + "/../build")
|
||||
import kt_kernel_ext
|
||||
from kt_kernel import kt_kernel_ext
|
||||
from flash_attn import flash_attn_with_kvcache
|
||||
import torch
|
||||
|
||||
@@ -59,19 +59,11 @@ with torch.inference_mode(mode=True):
|
||||
local_kvcache = kt_kernel_ext.kvcache.KVCache(config)
|
||||
|
||||
kvcaches = []
|
||||
block_table = (
|
||||
torch.arange(max_block_num, dtype=torch.int32, device="cpu")
|
||||
.contiguous()
|
||||
.view(1, -1)
|
||||
)
|
||||
block_table = torch.arange(max_block_num, dtype=torch.int32, device="cpu").contiguous().view(1, -1)
|
||||
|
||||
for layer_idx in range(layer_num):
|
||||
k_cache = torch.randn(
|
||||
(1, cache_seqlen, kv_head_num, head_dim), dtype=torch.float16, device="cpu"
|
||||
).contiguous()
|
||||
v_cache = torch.randn(
|
||||
(1, cache_seqlen, kv_head_num, head_dim), dtype=torch.float16, device="cpu"
|
||||
).contiguous()
|
||||
k_cache = torch.randn((1, cache_seqlen, kv_head_num, head_dim), dtype=torch.float16, device="cpu").contiguous()
|
||||
v_cache = torch.randn((1, cache_seqlen, kv_head_num, head_dim), dtype=torch.float16, device="cpu").contiguous()
|
||||
|
||||
CPUInfer.submit(
|
||||
local_kvcache.update_kvcache_fp16(
|
||||
@@ -94,17 +86,11 @@ with torch.inference_mode(mode=True):
|
||||
|
||||
k_cache = kvcaches[i % layer_num][0]
|
||||
v_cache = kvcaches[i % layer_num][1]
|
||||
input = torch.randn(
|
||||
(1, 1, q_head_num, head_dim), dtype=torch.float16, device="cpu"
|
||||
).contiguous()
|
||||
output = torch.empty(
|
||||
(1, 1, q_head_num, head_dim), dtype=torch.float16, device="cpu"
|
||||
).contiguous()
|
||||
input = torch.randn((1, 1, q_head_num, head_dim), dtype=torch.float16, device="cpu").contiguous()
|
||||
output = torch.empty((1, 1, q_head_num, head_dim), dtype=torch.float16, device="cpu").contiguous()
|
||||
|
||||
# attn_lse: (bsz, q_len, q_head_num)
|
||||
attn_lse = torch.empty(
|
||||
(1, 1, q_head_num), dtype=torch.float32, device="cpu"
|
||||
).contiguous()
|
||||
attn_lse = torch.empty((1, 1, q_head_num), dtype=torch.float32, device="cpu").contiguous()
|
||||
input = input / 100
|
||||
|
||||
CPUInfer.submit(
|
||||
@@ -135,8 +121,6 @@ with torch.inference_mode(mode=True):
|
||||
)
|
||||
# print("torch output", t_output)
|
||||
|
||||
diff = torch.mean(torch.abs(output.to("cuda") - t_output)) / torch.mean(
|
||||
torch.abs(t_output)
|
||||
)
|
||||
diff = torch.mean(torch.abs(output.to("cuda") - t_output)) / torch.mean(torch.abs(t_output))
|
||||
print("diff = ", diff)
|
||||
assert diff < 0.001
|
||||
|
||||
@@ -2,7 +2,7 @@ import os, sys
|
||||
|
||||
sys.path.insert(0, os.path.dirname(__file__) + "/../build")
|
||||
|
||||
import kt_kernel_ext
|
||||
from kt_kernel import kt_kernel_ext
|
||||
import torch
|
||||
|
||||
# Set fixed seed for reproducible results
|
||||
|
||||
@@ -1,8 +1,9 @@
|
||||
import os, sys
|
||||
import time
|
||||
|
||||
os.environ["BLAS_NUM_THREADS"] = "1"
|
||||
sys.path.insert(0, os.path.dirname(__file__) + "/../build")
|
||||
import kt_kernel_ext
|
||||
from kt_kernel import kt_kernel_ext
|
||||
from kt_kernel_ext.kvcache import ggml_type
|
||||
import torch
|
||||
import logging
|
||||
@@ -20,6 +21,7 @@ from transformers import (
|
||||
logger = logging.getLogger("reader")
|
||||
|
||||
from gguf.gguf_reader import GGUFReader
|
||||
|
||||
# load_layers = 6
|
||||
load_layers = None
|
||||
CPUInfer = kt_kernel_ext.CPUInfer(304)
|
||||
@@ -284,22 +286,21 @@ def build_moegate(layer_idx, json_config, gguf_weights):
|
||||
json_config["topk_group"],
|
||||
)
|
||||
|
||||
config.routed_scaling_factor = json_config['routed_scaling_factor']
|
||||
config.routed_scaling_factor = json_config["routed_scaling_factor"]
|
||||
|
||||
config.pool = CPUInfer.backend_
|
||||
|
||||
weight,weight_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.ffn_gate_inp.weight")
|
||||
weight, weight_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.ffn_gate_inp.weight")
|
||||
config.weight = weight.data_ptr()
|
||||
config.weight_type = type_to_ggml_type(weight_type)
|
||||
|
||||
bias,bias_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.exp_probs_b.bias")
|
||||
bias, bias_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.exp_probs_b.bias")
|
||||
config.e_score_correction_bias = bias.data_ptr()
|
||||
config.e_score_correction_bias_type = type_to_ggml_type(bias_type)
|
||||
|
||||
gate = kt_kernel_ext.gate.MoEGate(config)
|
||||
|
||||
|
||||
return gate
|
||||
|
||||
|
||||
|
||||
def build_llm(json_config, gguf_weights):
|
||||
@@ -312,15 +313,15 @@ def build_llm(json_config, gguf_weights):
|
||||
general_config.n_shared_experts = json_config["n_shared_experts"]
|
||||
general_config.max_qlen = max_qlen
|
||||
|
||||
lm_heads,lm_heads_type = get_torch_tensor_and_type_from_gguf(gguf_weights, "output.weight")
|
||||
lm_heads, lm_heads_type = get_torch_tensor_and_type_from_gguf(gguf_weights, "output.weight")
|
||||
general_config.lm_heads_ptr = lm_heads.data_ptr()
|
||||
general_config.lm_heads_type = type_to_ggml_type(lm_heads_type)
|
||||
|
||||
output_norm, output_norm_type = get_torch_tensor_and_type_from_gguf(gguf_weights, "output_norm.weight")
|
||||
general_config.norm_weights_ptr = output_norm.data_ptr()
|
||||
general_config.norm_weights_type = type_to_ggml_type(output_norm_type)
|
||||
general_config.norm_weights_type = type_to_ggml_type(output_norm_type)
|
||||
|
||||
token_embd,token_embd_type = get_torch_tensor_and_type_from_gguf(weights, "token_embd.weight")
|
||||
token_embd, token_embd_type = get_torch_tensor_and_type_from_gguf(weights, "token_embd.weight")
|
||||
general_config.token_embd_ptr = token_embd.data_ptr()
|
||||
general_config.token_embd_type = type_to_ggml_type(token_embd_type)
|
||||
|
||||
@@ -330,12 +331,11 @@ def build_llm(json_config, gguf_weights):
|
||||
model = kt_kernel_ext.DeepseekV3Model(general_config)
|
||||
llm.model = model
|
||||
|
||||
|
||||
decoder_layers = []
|
||||
real_load_layers = json_config["num_hidden_layers"] if load_layers is None else load_layers
|
||||
|
||||
for i in range(real_load_layers):
|
||||
layer = kt_kernel_ext.DeepseekV3DecoderLayer(general_config,i)
|
||||
layer = kt_kernel_ext.DeepseekV3DecoderLayer(general_config, i)
|
||||
attn_norm, attn_norm_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{i}.attn_norm.weight")
|
||||
ffn_norm, ffn_norm_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{i}.ffn_norm.weight")
|
||||
|
||||
@@ -351,11 +351,11 @@ def build_llm(json_config, gguf_weights):
|
||||
layer.ffn = build_ffn(i, json_config, gguf_weights)
|
||||
decoder_layers.append(layer)
|
||||
|
||||
model.layers = decoder_layers
|
||||
model.layers = decoder_layers
|
||||
return llm
|
||||
|
||||
|
||||
safetensor_path = '/home/bd/models/DeepSeek-R1'
|
||||
safetensor_path = "/home/bd/models/DeepSeek-R1"
|
||||
json_path = os.path.join(safetensor_path, "config.json")
|
||||
json_config = json.load(open(json_path, "r"))
|
||||
print(json_config)
|
||||
@@ -368,11 +368,11 @@ weights = dict(sorted(weights.items()))
|
||||
for name, t in weights.items():
|
||||
# if not name.startswith("blk"):
|
||||
# if name.startswith("blk.10."):
|
||||
# if "ffn_gate." in name:
|
||||
# print(f"Found weight: {t.name}, Shape: {t.shape}, Type: {t.tensor_type.name}, Size: {t.n_elements}")
|
||||
# if "ffn_gate." in name:
|
||||
# print(f"Found weight: {t.name}, Shape: {t.shape}, Type: {t.tensor_type.name}, Size: {t.n_elements}")
|
||||
print(f"Found weight: {t.name}, Shape: {t.shape}, Type: {t.tensor_type.name}, Size: {t.n_elements}")
|
||||
|
||||
print("Building LLM ...")
|
||||
|
||||
print("Building LLM ...")
|
||||
load_start_time = time.perf_counter()
|
||||
llm = build_llm(json_config, weights)
|
||||
load_end_time = time.perf_counter()
|
||||
@@ -389,22 +389,20 @@ config = AutoConfig.from_pretrained(safetensor_path, trust_remote_code=True)
|
||||
force_think = False
|
||||
|
||||
|
||||
output_logits = torch.zeros((max_qlen, json_config['vocab_size']), dtype=torch.float32)
|
||||
output_logits = torch.zeros((max_qlen, json_config["vocab_size"]), dtype=torch.float32)
|
||||
|
||||
|
||||
def start_chat(content=None):
|
||||
if content is None:
|
||||
content = input("Chat: ")
|
||||
|
||||
|
||||
messages = [{"role": "user", "content": content}]
|
||||
input_tensor = tokenizer.apply_chat_template(
|
||||
messages, add_generation_prompt=True, return_tensors="pt"
|
||||
)
|
||||
input_tensor = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt")
|
||||
if force_think:
|
||||
token_thinks = torch.tensor([tokenizer.encode("<think>\\n",add_special_tokens=False)],device=input_tensor.device)
|
||||
input_tensor = torch.cat(
|
||||
[input_tensor, token_thinks], dim=1
|
||||
token_thinks = torch.tensor(
|
||||
[tokenizer.encode("<think>\\n", add_special_tokens=False)], device=input_tensor.device
|
||||
)
|
||||
input_tensor = torch.cat([input_tensor, token_thinks], dim=1)
|
||||
input_tensor = input_tensor.squeeze(0) # Add batch dimension
|
||||
|
||||
print(f"Input tensor: {input_tensor}, type {input_tensor.dtype}, shape {input_tensor.shape}")
|
||||
@@ -415,34 +413,36 @@ def start_chat(content=None):
|
||||
stream = TextStreamer(tokenizer)
|
||||
|
||||
qlen = input_tensor.shape[0]
|
||||
qlens = [qlen-kvlen]
|
||||
qlens = [qlen - kvlen]
|
||||
kvlens = [kvlen]
|
||||
page_tables = [list(range(pages_count))]
|
||||
start_time = time.perf_counter()
|
||||
llm.forward(qlens,page_tables, kvlens, input_tensor[kvlen:].data_ptr(), output_logits.data_ptr())
|
||||
llm.forward(qlens, page_tables, kvlens, input_tensor[kvlen:].data_ptr(), output_logits.data_ptr())
|
||||
end_time = time.perf_counter()
|
||||
print(f"Forward time: {end_time - start_time:.4f} seconds, tps: {qlens[0] / (end_time - start_time)} tokens/sec")
|
||||
|
||||
print(
|
||||
f"Forward time: {end_time - start_time:.4f} seconds, tps: {qlens[0] / (end_time - start_time)} tokens/sec"
|
||||
)
|
||||
|
||||
logits = output_logits[0]
|
||||
# print(logits)
|
||||
# sample
|
||||
# sample
|
||||
next_token = torch.argmax(logits).item()
|
||||
# print(f"Next token: {next_token}, {tokenizer.decode(next_token)}")
|
||||
kvlen = input_tensor.shape[0]
|
||||
input_tensor = torch.cat((input_tensor, torch.tensor([next_token])), dim=-1)
|
||||
|
||||
if next_token == tokenizer.eos_token_id or tokenizer.decode(next_token) == '<|im_end|>':
|
||||
|
||||
if next_token == tokenizer.eos_token_id or tokenizer.decode(next_token) == "<|im_end|>":
|
||||
stream.end()
|
||||
break
|
||||
else:
|
||||
stream.put(torch.tensor([next_token]))
|
||||
|
||||
|
||||
job_id = 0
|
||||
while True:
|
||||
try:
|
||||
# ---------- 让用户决定是否继续 ----------
|
||||
choice = input(
|
||||
"\n【回车】开始对话 | 输入 1 读取文件 | 输入 q/quit/exit 退出程序: "
|
||||
).strip().lower()
|
||||
choice = input("\n【回车】开始对话 | 输入 1 读取文件 | 输入 q/quit/exit 退出程序: ").strip().lower()
|
||||
if choice in {"q", "quit", "exit"}:
|
||||
print("收到退出指令,程序结束。")
|
||||
break
|
||||
@@ -466,15 +466,4 @@ while True:
|
||||
print(f"\n发生错误:{e}\n已终止对话 #{job_id},马上重启…")
|
||||
logger.error(f"Error in job {job_id}: {e}", exc_info=True)
|
||||
finally:
|
||||
job_id += 1 # 不管中断与否,都给下一任务换编号
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
job_id += 1 # 不管中断与否,都给下一任务换编号
|
||||
|
||||
@@ -1,8 +1,9 @@
|
||||
import os, sys
|
||||
import time
|
||||
|
||||
os.environ["BLAS_NUM_THREADS"] = "1"
|
||||
sys.path.insert(0, os.path.dirname(__file__) + "/../build")
|
||||
import kt_kernel_ext
|
||||
from kt_kernel import kt_kernel_ext
|
||||
from kt_kernel_ext.kvcache import ggml_type
|
||||
import torch
|
||||
import logging
|
||||
@@ -188,7 +189,6 @@ def build_mla(layer_idx, json_config, gguf_weights):
|
||||
config.layer_idx = layer_idx
|
||||
config.pool = CPUInfer.backend_
|
||||
config.page_count = pages_count
|
||||
|
||||
|
||||
if q_a_type == "F32":
|
||||
mla = kt_kernel_ext.mla.MLA_F32(config)
|
||||
@@ -284,22 +284,21 @@ def build_moegate(layer_idx, json_config, gguf_weights):
|
||||
json_config["topk_group"],
|
||||
)
|
||||
|
||||
config.routed_scaling_factor = json_config['routed_scaling_factor']
|
||||
config.routed_scaling_factor = json_config["routed_scaling_factor"]
|
||||
|
||||
config.pool = CPUInfer.backend_
|
||||
|
||||
weight,weight_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.ffn_gate_inp.weight")
|
||||
weight, weight_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.ffn_gate_inp.weight")
|
||||
config.weight = weight.data_ptr()
|
||||
config.weight_type = type_to_ggml_type(weight_type)
|
||||
|
||||
bias,bias_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.exp_probs_b.bias")
|
||||
bias, bias_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.exp_probs_b.bias")
|
||||
config.e_score_correction_bias = bias.data_ptr()
|
||||
config.e_score_correction_bias_type = type_to_ggml_type(bias_type)
|
||||
|
||||
gate = kt_kernel_ext.gate.MoEGate(config)
|
||||
|
||||
|
||||
return gate
|
||||
|
||||
|
||||
|
||||
def build_llm(json_config, gguf_weights):
|
||||
@@ -312,15 +311,15 @@ def build_llm(json_config, gguf_weights):
|
||||
general_config.n_shared_experts = json_config["n_shared_experts"]
|
||||
general_config.max_qlen = max_qlen
|
||||
|
||||
lm_heads,lm_heads_type = get_torch_tensor_and_type_from_gguf(gguf_weights, "output.weight")
|
||||
lm_heads, lm_heads_type = get_torch_tensor_and_type_from_gguf(gguf_weights, "output.weight")
|
||||
general_config.lm_heads_ptr = lm_heads.data_ptr()
|
||||
general_config.lm_heads_type = type_to_ggml_type(lm_heads_type)
|
||||
|
||||
output_norm, output_norm_type = get_torch_tensor_and_type_from_gguf(gguf_weights, "output_norm.weight")
|
||||
general_config.norm_weights_ptr = output_norm.data_ptr()
|
||||
general_config.norm_weights_type = type_to_ggml_type(output_norm_type)
|
||||
general_config.norm_weights_type = type_to_ggml_type(output_norm_type)
|
||||
|
||||
token_embd,token_embd_type = get_torch_tensor_and_type_from_gguf(weights, "token_embd.weight")
|
||||
token_embd, token_embd_type = get_torch_tensor_and_type_from_gguf(weights, "token_embd.weight")
|
||||
general_config.token_embd_ptr = token_embd.data_ptr()
|
||||
general_config.token_embd_type = type_to_ggml_type(token_embd_type)
|
||||
|
||||
@@ -330,12 +329,11 @@ def build_llm(json_config, gguf_weights):
|
||||
model = kt_kernel_ext.DeepseekV3Model(general_config)
|
||||
llm.model = model
|
||||
|
||||
|
||||
decoder_layers = []
|
||||
for i in range(json_config["num_hidden_layers"]):
|
||||
# for i in range(6):
|
||||
# for i in [0,1,2,3,4,5,6,7,8,9,10]:
|
||||
layer = kt_kernel_ext.DeepseekV3DecoderLayer(general_config,i)
|
||||
# for i in range(6):
|
||||
# for i in [0,1,2,3,4,5,6,7,8,9,10]:
|
||||
layer = kt_kernel_ext.DeepseekV3DecoderLayer(general_config, i)
|
||||
attn_norm, attn_norm_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{i}.attn_norm.weight")
|
||||
ffn_norm, ffn_norm_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{i}.ffn_norm.weight")
|
||||
|
||||
@@ -351,11 +349,11 @@ def build_llm(json_config, gguf_weights):
|
||||
layer.ffn = build_ffn(i, json_config, gguf_weights)
|
||||
decoder_layers.append(layer)
|
||||
|
||||
model.layers = decoder_layers
|
||||
model.layers = decoder_layers
|
||||
return llm
|
||||
|
||||
|
||||
safetensor_path = '/home/bd/models/DeepSeek-R1'
|
||||
safetensor_path = "/home/bd/models/DeepSeek-R1"
|
||||
json_path = os.path.join(safetensor_path, "config.json")
|
||||
json_config = json.load(open(json_path, "r"))
|
||||
print(json_config)
|
||||
@@ -368,8 +366,8 @@ weights = dict(sorted(weights.items()))
|
||||
for name, t in weights.items():
|
||||
# if not name.startswith("blk"):
|
||||
# if name.startswith("blk.10."):
|
||||
# if "ffn_gate." in name:
|
||||
# print(f"Found weight: {t.name}, Shape: {t.shape}, Type: {t.tensor_type.name}, Size: {t.n_elements}")
|
||||
# if "ffn_gate." in name:
|
||||
# print(f"Found weight: {t.name}, Shape: {t.shape}, Type: {t.tensor_type.name}, Size: {t.n_elements}")
|
||||
print(f"Found weight: {t.name}, Shape: {t.shape}, Type: {t.tensor_type.name}, Size: {t.n_elements}")
|
||||
print("Building LLM ...")
|
||||
llm = build_llm(json_config, weights)
|
||||
@@ -384,7 +382,7 @@ prompt_file = None
|
||||
force_think = False
|
||||
|
||||
|
||||
output_logits = torch.zeros((max_qlen, json_config['vocab_size']), dtype=torch.float32)
|
||||
output_logits = torch.zeros((max_qlen, json_config["vocab_size"]), dtype=torch.float32)
|
||||
|
||||
|
||||
def start_chat():
|
||||
@@ -411,16 +409,14 @@ def start_chat():
|
||||
content = "Please write a piece of quicksort code in C++."
|
||||
elif os.path.isfile(content):
|
||||
content = open(content, "r").read()
|
||||
|
||||
|
||||
messages = [{"role": "user", "content": content}]
|
||||
input_tensor = tokenizer.apply_chat_template(
|
||||
messages, add_generation_prompt=True, return_tensors="pt"
|
||||
)
|
||||
input_tensor = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt")
|
||||
if force_think:
|
||||
token_thinks = torch.tensor([tokenizer.encode("<think>\\n",add_special_tokens=False)],device=input_tensor.device)
|
||||
input_tensor = torch.cat(
|
||||
[input_tensor, token_thinks], dim=1
|
||||
token_thinks = torch.tensor(
|
||||
[tokenizer.encode("<think>\\n", add_special_tokens=False)], device=input_tensor.device
|
||||
)
|
||||
input_tensor = torch.cat([input_tensor, token_thinks], dim=1)
|
||||
input_tensor = input_tensor.squeeze(0) # Add batch dimension
|
||||
|
||||
print(f"Input tensor: {input_tensor}, type {input_tensor.dtype}, shape {input_tensor.shape}")
|
||||
@@ -431,28 +427,27 @@ def start_chat():
|
||||
qlens = [qlen]
|
||||
kvlens = [0]
|
||||
page_tables = [list(range(pages_count))]
|
||||
llm.forward(qlens,page_tables, kvlens, input_tensor.data_ptr(), output_logits.data_ptr())
|
||||
|
||||
llm.forward(qlens, page_tables, kvlens, input_tensor.data_ptr(), output_logits.data_ptr())
|
||||
|
||||
logits = output_logits[0]
|
||||
# print(logits)
|
||||
# sample
|
||||
# sample
|
||||
next_token = torch.argmax(logits).item()
|
||||
# print(f"Next token: {next_token}, {tokenizer.decode(next_token)}")
|
||||
input_tensor = torch.cat((input_tensor, torch.tensor([next_token])), dim=-1)
|
||||
|
||||
|
||||
if next_token == tokenizer.eos_token_id or tokenizer.decode(next_token) == '<|im_end|>':
|
||||
|
||||
if next_token == tokenizer.eos_token_id or tokenizer.decode(next_token) == "<|im_end|>":
|
||||
print(stream.end(), end="", flush=True)
|
||||
break
|
||||
else:
|
||||
print(stream.put(torch.tensor([next_token])), end="", flush=True)
|
||||
|
||||
|
||||
job_id = 0
|
||||
while True:
|
||||
try:
|
||||
# ---------- 让用户决定是否继续 ----------
|
||||
choice = input(
|
||||
"\n【回车】开始对话 | 输入 q/quit/exit 退出程序: "
|
||||
).strip().lower()
|
||||
choice = input("\n【回车】开始对话 | 输入 q/quit/exit 退出程序: ").strip().lower()
|
||||
if choice in {"q", "quit", "exit"}:
|
||||
print("收到退出指令,程序结束。")
|
||||
break
|
||||
@@ -464,15 +459,4 @@ while True:
|
||||
# 随时 Ctrl-C:放弃当前任务并重启
|
||||
print(f"\n检测到 Ctrl-C,已终止对话 #{job_id},马上重启…")
|
||||
finally:
|
||||
job_id += 1 # 不管中断与否,都给下一任务换编号
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
job_id += 1 # 不管中断与否,都给下一任务换编号
|
||||
|
||||
@@ -1,8 +1,9 @@
|
||||
import os, sys
|
||||
import time
|
||||
|
||||
os.environ["BLAS_NUM_THREADS"] = "1"
|
||||
sys.path.insert(0, os.path.dirname(__file__) + "/../build")
|
||||
import kt_kernel_ext
|
||||
from kt_kernel import kt_kernel_ext
|
||||
from kt_kernel_ext.kvcache import ggml_type
|
||||
import torch
|
||||
import logging
|
||||
@@ -20,12 +21,13 @@ from transformers import (
|
||||
logger = logging.getLogger("reader")
|
||||
|
||||
from gguf.gguf_reader import GGUFReader
|
||||
|
||||
# load_layers = 3
|
||||
load_layers = None
|
||||
worker_config = kt_kernel_ext.WorkerPoolConfig()
|
||||
worker_config.subpool_count = 2
|
||||
worker_config.subpool_numa_map= [0,1]
|
||||
worker_config.subpool_thread_count = [72,72]
|
||||
worker_config.subpool_numa_map = [0, 1]
|
||||
worker_config.subpool_thread_count = [72, 72]
|
||||
CPUInfer = kt_kernel_ext.CPUInfer(worker_config)
|
||||
|
||||
max_qlen = 4096
|
||||
@@ -289,22 +291,21 @@ def build_moegate(layer_idx, json_config, gguf_weights):
|
||||
json_config["topk_group"],
|
||||
)
|
||||
|
||||
config.routed_scaling_factor = json_config['routed_scaling_factor']
|
||||
config.routed_scaling_factor = json_config["routed_scaling_factor"]
|
||||
|
||||
config.pool = CPUInfer.backend_
|
||||
|
||||
weight,weight_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.ffn_gate_inp.weight")
|
||||
weight, weight_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.ffn_gate_inp.weight")
|
||||
config.weight = weight.data_ptr()
|
||||
config.weight_type = type_to_ggml_type(weight_type)
|
||||
|
||||
bias,bias_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.exp_probs_b.bias")
|
||||
bias, bias_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.exp_probs_b.bias")
|
||||
config.e_score_correction_bias = bias.data_ptr()
|
||||
config.e_score_correction_bias_type = type_to_ggml_type(bias_type)
|
||||
|
||||
gate = kt_kernel_ext.gate.MoEGate(config)
|
||||
|
||||
|
||||
return gate
|
||||
|
||||
|
||||
|
||||
def build_llm(json_config, gguf_weights):
|
||||
@@ -317,15 +318,15 @@ def build_llm(json_config, gguf_weights):
|
||||
general_config.n_shared_experts = json_config["n_shared_experts"]
|
||||
general_config.max_qlen = max_qlen
|
||||
|
||||
lm_heads,lm_heads_type = get_torch_tensor_and_type_from_gguf(gguf_weights, "output.weight")
|
||||
lm_heads, lm_heads_type = get_torch_tensor_and_type_from_gguf(gguf_weights, "output.weight")
|
||||
general_config.lm_heads_ptr = lm_heads.data_ptr()
|
||||
general_config.lm_heads_type = type_to_ggml_type(lm_heads_type)
|
||||
|
||||
output_norm, output_norm_type = get_torch_tensor_and_type_from_gguf(gguf_weights, "output_norm.weight")
|
||||
general_config.norm_weights_ptr = output_norm.data_ptr()
|
||||
general_config.norm_weights_type = type_to_ggml_type(output_norm_type)
|
||||
general_config.norm_weights_type = type_to_ggml_type(output_norm_type)
|
||||
|
||||
token_embd,token_embd_type = get_torch_tensor_and_type_from_gguf(weights, "token_embd.weight")
|
||||
token_embd, token_embd_type = get_torch_tensor_and_type_from_gguf(weights, "token_embd.weight")
|
||||
general_config.token_embd_ptr = token_embd.data_ptr()
|
||||
general_config.token_embd_type = type_to_ggml_type(token_embd_type)
|
||||
|
||||
@@ -335,13 +336,12 @@ def build_llm(json_config, gguf_weights):
|
||||
model = kt_kernel_ext.DeepseekV3Model(general_config)
|
||||
llm.model = model
|
||||
|
||||
|
||||
decoder_layers = []
|
||||
real_load_layers = json_config["num_hidden_layers"] if load_layers is None else load_layers
|
||||
|
||||
for i in range(real_load_layers):
|
||||
# for i in [2,3]:
|
||||
layer = kt_kernel_ext.DeepseekV3DecoderLayer(general_config,i)
|
||||
# for i in [2,3]:
|
||||
layer = kt_kernel_ext.DeepseekV3DecoderLayer(general_config, i)
|
||||
attn_norm, attn_norm_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{i}.attn_norm.weight")
|
||||
ffn_norm, ffn_norm_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{i}.ffn_norm.weight")
|
||||
|
||||
@@ -357,11 +357,11 @@ def build_llm(json_config, gguf_weights):
|
||||
layer.ffn = build_ffn(i, json_config, gguf_weights)
|
||||
decoder_layers.append(layer)
|
||||
|
||||
model.layers = decoder_layers
|
||||
model.layers = decoder_layers
|
||||
return llm
|
||||
|
||||
|
||||
safetensor_path = '/home/bd/models/DeepSeek-R1'
|
||||
safetensor_path = "/home/bd/models/DeepSeek-R1"
|
||||
json_path = os.path.join(safetensor_path, "config.json")
|
||||
json_config = json.load(open(json_path, "r"))
|
||||
print(json_config)
|
||||
@@ -372,13 +372,13 @@ weights = dict(sorted(weights.items()))
|
||||
|
||||
|
||||
# for name, t in weights.items():
|
||||
# if not name.startswith("blk"):
|
||||
# if name.startswith("blk.10."):
|
||||
# if "ffn_gate." in name:
|
||||
# print(f"Found weight: {t.name}, Shape: {t.shape}, Type: {t.tensor_type.name}, Size: {t.n_elements}")
|
||||
# print(f"Found weight: {t.name}, Shape: {t.shape}, Type: {t.tensor_type.name}, Size: {t.n_elements}")
|
||||
|
||||
print("Building LLM ...")
|
||||
# if not name.startswith("blk"):
|
||||
# if name.startswith("blk.10."):
|
||||
# if "ffn_gate." in name:
|
||||
# print(f"Found weight: {t.name}, Shape: {t.shape}, Type: {t.tensor_type.name}, Size: {t.n_elements}")
|
||||
# print(f"Found weight: {t.name}, Shape: {t.shape}, Type: {t.tensor_type.name}, Size: {t.n_elements}")
|
||||
|
||||
print("Building LLM ...")
|
||||
load_start_time = time.perf_counter()
|
||||
llm = build_llm(json_config, weights)
|
||||
load_end_time = time.perf_counter()
|
||||
@@ -395,22 +395,20 @@ config = AutoConfig.from_pretrained(safetensor_path, trust_remote_code=True)
|
||||
force_think = False
|
||||
|
||||
|
||||
output_logits = torch.zeros((max_qlen, json_config['vocab_size']), dtype=torch.float32)
|
||||
output_logits = torch.zeros((max_qlen, json_config["vocab_size"]), dtype=torch.float32)
|
||||
|
||||
|
||||
def start_chat(content=None):
|
||||
if content is None:
|
||||
content = input("Chat: ")
|
||||
|
||||
|
||||
messages = [{"role": "user", "content": content}]
|
||||
input_tensor = tokenizer.apply_chat_template(
|
||||
messages, add_generation_prompt=True, return_tensors="pt"
|
||||
)
|
||||
input_tensor = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt")
|
||||
if force_think:
|
||||
token_thinks = torch.tensor([tokenizer.encode("<think>\\n",add_special_tokens=False)],device=input_tensor.device)
|
||||
input_tensor = torch.cat(
|
||||
[input_tensor, token_thinks], dim=1
|
||||
token_thinks = torch.tensor(
|
||||
[tokenizer.encode("<think>\\n", add_special_tokens=False)], device=input_tensor.device
|
||||
)
|
||||
input_tensor = torch.cat([input_tensor, token_thinks], dim=1)
|
||||
input_tensor = input_tensor.squeeze(0) # Add batch dimension
|
||||
|
||||
print(f"Input tensor: {input_tensor}, type {input_tensor.dtype}, shape {input_tensor.shape}")
|
||||
@@ -425,30 +423,32 @@ def start_chat(content=None):
|
||||
kvlens = [0]
|
||||
page_tables = [list(range(pages_count))]
|
||||
start_time = time.perf_counter()
|
||||
llm.forward(qlens,page_tables, kvlens, input_tensor.data_ptr(), output_logits.data_ptr())
|
||||
llm.forward(qlens, page_tables, kvlens, input_tensor.data_ptr(), output_logits.data_ptr())
|
||||
end_time = time.perf_counter()
|
||||
print(f"Forward time: {end_time - start_time:.4f} seconds, tps: {qlens[0] / (end_time - start_time)} tokens/sec")
|
||||
|
||||
print(
|
||||
f"Forward time: {end_time - start_time:.4f} seconds, tps: {qlens[0] / (end_time - start_time)} tokens/sec"
|
||||
)
|
||||
|
||||
logits = output_logits[0]
|
||||
# print(logits)
|
||||
# sample
|
||||
# sample
|
||||
next_token = torch.argmax(logits).item()
|
||||
# print(f"Next token: {next_token}, {tokenizer.decode(next_token)}")
|
||||
# kvlen = input_tensor.shape[0]
|
||||
input_tensor = torch.cat((input_tensor, torch.tensor([next_token])), dim=-1)
|
||||
|
||||
if next_token == tokenizer.eos_token_id or tokenizer.decode(next_token) == '<|im_end|>':
|
||||
|
||||
if next_token == tokenizer.eos_token_id or tokenizer.decode(next_token) == "<|im_end|>":
|
||||
stream.end()
|
||||
break
|
||||
else:
|
||||
stream.put(torch.tensor([next_token]))
|
||||
|
||||
|
||||
job_id = 0
|
||||
while True:
|
||||
try:
|
||||
# ---------- 让用户决定是否继续 ----------
|
||||
choice = input(
|
||||
"\n【回车】开始对话 | 输入 1 读取文件 | 输入 q/quit/exit 退出程序: "
|
||||
).strip().lower()
|
||||
choice = input("\n【回车】开始对话 | 输入 1 读取文件 | 输入 q/quit/exit 退出程序: ").strip().lower()
|
||||
if choice in {"q", "quit", "exit"}:
|
||||
print("收到退出指令,程序结束。")
|
||||
break
|
||||
@@ -472,15 +472,4 @@ while True:
|
||||
print(f"\n发生错误:{e}\n已终止对话 #{job_id},马上重启…")
|
||||
logger.error(f"Error in job {job_id}: {e}", exc_info=True)
|
||||
finally:
|
||||
job_id += 1 # 不管中断与否,都给下一任务换编号
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
job_id += 1 # 不管中断与否,都给下一任务换编号
|
||||
|
||||
@@ -1,15 +1,17 @@
|
||||
import math
|
||||
import os,sys
|
||||
import os, sys
|
||||
import time
|
||||
from typing import Optional
|
||||
|
||||
os.environ["BLAS_NUM_THREADS"] = "1"
|
||||
sys.path.insert(0, os.path.dirname(__file__) + '/../build')
|
||||
import kt_kernel_ext
|
||||
sys.path.insert(0, os.path.dirname(__file__) + "/../build")
|
||||
from kt_kernel import kt_kernel_ext
|
||||
from kt_kernel_ext.kvcache import ggml_type
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
import torch.nn.functional as F
|
||||
|
||||
# from modeling_deepseek_v3 import MoEGate
|
||||
from configuration_deepseek_v3 import DeepseekV3Config
|
||||
|
||||
@@ -28,17 +30,20 @@ n_group = config.n_group
|
||||
topk_group = config.topk_group
|
||||
routed_scaling_factor = config.routed_scaling_factor
|
||||
|
||||
weights = torch.randn((n_routed_experts, hidden_size), dtype=torch.float32).to('cpu').contiguous()
|
||||
bias = torch.randn((n_routed_experts,), dtype=torch.float32).to('cpu').contiguous()
|
||||
weights = torch.randn((n_routed_experts, hidden_size), dtype=torch.float32).to("cpu").contiguous()
|
||||
bias = torch.randn((n_routed_experts,), dtype=torch.float32).to("cpu").contiguous()
|
||||
|
||||
|
||||
# weights = torch.randn((n_routed_experts, hidden_size), dtype=torch.float16).to('cpu').contiguous ()
|
||||
def load_fp32_tensor(file_path, shape):
|
||||
return torch.zeros(shape, dtype=torch.float32).to('cpu').contiguous()
|
||||
with open(file_path, 'rb') as f:
|
||||
return torch.zeros(shape, dtype=torch.float32).to("cpu").contiguous()
|
||||
with open(file_path, "rb") as f:
|
||||
raw_data = f.read()
|
||||
tensor = torch.frombuffer(raw_data, dtype=torch.float32)
|
||||
tensor = tensor.view(shape) # 根据你的 shape reshape
|
||||
return tensor
|
||||
|
||||
|
||||
class MoEGate(nn.Module):
|
||||
def __init__(self, config):
|
||||
super().__init__()
|
||||
@@ -54,13 +59,9 @@ class MoEGate(nn.Module):
|
||||
# topk selection algorithm
|
||||
self.norm_topk_prob = config.norm_topk_prob
|
||||
self.gating_dim = config.hidden_size
|
||||
self.weight = nn.Parameter(
|
||||
torch.empty((self.n_routed_experts, self.gating_dim))
|
||||
)
|
||||
self.weight = nn.Parameter(torch.empty((self.n_routed_experts, self.gating_dim)))
|
||||
if self.topk_method == "noaux_tc":
|
||||
self.e_score_correction_bias = nn.Parameter(
|
||||
torch.empty((self.n_routed_experts))
|
||||
)
|
||||
self.e_score_correction_bias = nn.Parameter(torch.empty((self.n_routed_experts)))
|
||||
self.reset_parameters()
|
||||
|
||||
def reset_parameters(self) -> None:
|
||||
@@ -73,93 +74,88 @@ class MoEGate(nn.Module):
|
||||
### compute gating score
|
||||
hidden_states = hidden_states.view(-1, h)
|
||||
|
||||
h_to_check = load_fp32_tensor('/home/yzw/xwy/Projects/ktransformers-dev/csrc/ktransformers_ext/examples/debug/gate_input',(seq_len,h))
|
||||
h_to_check = load_fp32_tensor(
|
||||
"/home/yzw/xwy/Projects/ktransformers-dev/csrc/ktransformers_ext/examples/debug/gate_input", (seq_len, h)
|
||||
)
|
||||
diff = (h_to_check - hidden_states).abs().max()
|
||||
# print("hidden_states diff:", diff)
|
||||
# assert diff<0.02
|
||||
|
||||
|
||||
bias_to_check = load_fp32_tensor('/home/yzw/xwy/Projects/ktransformers-dev/csrc/ktransformers_ext/examples/debug/bias',(n_routed_experts))
|
||||
bias_to_check = load_fp32_tensor(
|
||||
"/home/yzw/xwy/Projects/ktransformers-dev/csrc/ktransformers_ext/examples/debug/bias", (n_routed_experts)
|
||||
)
|
||||
diff = (bias - bias_to_check).abs().max()
|
||||
# print('bias diff:',diff)
|
||||
# assert diff < 0.02
|
||||
|
||||
logits = F.linear(hidden_states.type(torch.float32), self.weight.type(torch.float32), None)
|
||||
|
||||
logits = F.linear(
|
||||
hidden_states.type(torch.float32), self.weight.type(torch.float32), None
|
||||
logits_to_check = load_fp32_tensor(
|
||||
"/home/yzw/xwy/Projects/ktransformers-dev/csrc/ktransformers_ext/examples/debug/gate_logits",
|
||||
(seq_len, n_routed_experts),
|
||||
)
|
||||
|
||||
logits_to_check = load_fp32_tensor('/home/yzw/xwy/Projects/ktransformers-dev/csrc/ktransformers_ext/examples/debug/gate_logits',(seq_len,n_routed_experts))
|
||||
diff = (logits_to_check - logits).abs().max()
|
||||
# print("logits diff:", diff)
|
||||
# assert diff < 0.02
|
||||
|
||||
|
||||
if self.scoring_func == "sigmoid":
|
||||
scores = logits.sigmoid()
|
||||
else:
|
||||
raise NotImplementedError(
|
||||
f"insupportable scoring function for MoE gating: {self.scoring_func}"
|
||||
)
|
||||
raise NotImplementedError(f"insupportable scoring function for MoE gating: {self.scoring_func}")
|
||||
|
||||
### select top-k experts
|
||||
if self.topk_method == "noaux_tc":
|
||||
# assert not self.training
|
||||
scores_for_choice = scores.view(bsz * seq_len, -1) + self.e_score_correction_bias.unsqueeze(0)
|
||||
|
||||
scores_to_check = load_fp32_tensor('/home/yzw/xwy/Projects/ktransformers-dev/csrc/ktransformers_ext/examples/debug/scores_to_choice',(seq_len,n_routed_experts))
|
||||
scores_to_check = load_fp32_tensor(
|
||||
"/home/yzw/xwy/Projects/ktransformers-dev/csrc/ktransformers_ext/examples/debug/scores_to_choice",
|
||||
(seq_len, n_routed_experts),
|
||||
)
|
||||
diff = (scores_for_choice - scores_to_check).abs().max()
|
||||
print(f'score for choice diff = {diff}')
|
||||
|
||||
print(f"score for choice diff = {diff}")
|
||||
|
||||
group_scores = (
|
||||
scores_for_choice.view(bsz * seq_len, self.n_group, -1).topk(2, dim=-1)[0].sum(dim = -1)
|
||||
scores_for_choice.view(bsz * seq_len, self.n_group, -1).topk(2, dim=-1)[0].sum(dim=-1)
|
||||
) # [n, n_group]
|
||||
|
||||
group_scores_to_check = load_fp32_tensor('/home/yzw/xwy/Projects/ktransformers-dev/csrc/ktransformers_ext/examples/debug/group_scores',(seq_len,n_group))
|
||||
group_scores_to_check = load_fp32_tensor(
|
||||
"/home/yzw/xwy/Projects/ktransformers-dev/csrc/ktransformers_ext/examples/debug/group_scores",
|
||||
(seq_len, n_group),
|
||||
)
|
||||
diff = (group_scores - group_scores_to_check).abs().max()
|
||||
print(f'group scores diff = {diff}')
|
||||
print(f"group scores diff = {diff}")
|
||||
|
||||
|
||||
group_idx = torch.topk(
|
||||
group_scores, k=self.topk_group, dim=-1, sorted=False
|
||||
)[
|
||||
1
|
||||
] # [n, top_k_group]
|
||||
group_idx = torch.topk(group_scores, k=self.topk_group, dim=-1, sorted=False)[1] # [n, top_k_group]
|
||||
group_mask = torch.zeros_like(group_scores) # [n, n_group]
|
||||
group_mask.scatter_(1, group_idx, 1) # [n, n_group]
|
||||
score_mask = (
|
||||
group_mask.unsqueeze(-1)
|
||||
.expand(
|
||||
bsz * seq_len, self.n_group, self.n_routed_experts // self.n_group
|
||||
)
|
||||
.expand(bsz * seq_len, self.n_group, self.n_routed_experts // self.n_group)
|
||||
.reshape(bsz * seq_len, -1)
|
||||
) # [n, e]
|
||||
tmp_scores = scores_for_choice.masked_fill(~score_mask.bool(), float("-inf")) # [n, e]
|
||||
tmp_scores_to_check = load_fp32_tensor('/home/yzw/xwy/Projects/ktransformers-dev/csrc/ktransformers_ext/examples/debug/gate_logits_toped',(seq_len,n_routed_experts))
|
||||
is_close = torch.isclose(tmp_scores, tmp_scores_to_check, rtol=1e-2, atol=1e-2, equal_nan=True)
|
||||
print(f'tmp_score ok {is_close.all()}')
|
||||
|
||||
|
||||
_, topk_idx = torch.topk(
|
||||
tmp_scores, k=self.top_k, dim=-1, sorted=False
|
||||
tmp_scores_to_check = load_fp32_tensor(
|
||||
"/home/yzw/xwy/Projects/ktransformers-dev/csrc/ktransformers_ext/examples/debug/gate_logits_toped",
|
||||
(seq_len, n_routed_experts),
|
||||
)
|
||||
is_close = torch.isclose(tmp_scores, tmp_scores_to_check, rtol=1e-2, atol=1e-2, equal_nan=True)
|
||||
print(f"tmp_score ok {is_close.all()}")
|
||||
|
||||
_, topk_idx = torch.topk(tmp_scores, k=self.top_k, dim=-1, sorted=False)
|
||||
topk_weight = scores.gather(1, topk_idx)
|
||||
else:
|
||||
raise NotImplementedError(
|
||||
f"insupportable TopK function for MoE gating: {self.topk_method}"
|
||||
)
|
||||
raise NotImplementedError(f"insupportable TopK function for MoE gating: {self.topk_method}")
|
||||
|
||||
### norm gate to sum 1
|
||||
if self.top_k > 1 and self.norm_topk_prob:
|
||||
denominator = topk_weight.sum(dim=-1, keepdim=True) + 1e-20
|
||||
topk_weight = topk_weight / denominator
|
||||
topk_weight = topk_weight * self.routed_scaling_factor # must multiply the scaling factor
|
||||
topk_weight = topk_weight * self.routed_scaling_factor # must multiply the scaling factor
|
||||
|
||||
return topk_idx, topk_weight
|
||||
|
||||
|
||||
|
||||
def torch_gate(hidden_states):
|
||||
hidden_states.unsqueeze_(0)
|
||||
gate = MoEGate(config)
|
||||
@@ -172,11 +168,11 @@ def torch_gate(hidden_states):
|
||||
|
||||
def cpuinfer_gate(hidden_states):
|
||||
config = kt_kernel_ext.gate.GateConfig(
|
||||
hidden_size,
|
||||
num_experts_per_token,
|
||||
n_routed_experts,
|
||||
n_group,
|
||||
topk_group,
|
||||
hidden_size,
|
||||
num_experts_per_token,
|
||||
n_routed_experts,
|
||||
n_group,
|
||||
topk_group,
|
||||
)
|
||||
|
||||
CPUInfer = kt_kernel_ext.CPUInfer(64)
|
||||
@@ -188,32 +184,29 @@ def cpuinfer_gate(hidden_states):
|
||||
config.e_score_correction_bias = bias.data_ptr()
|
||||
config.e_score_correction_bias_type = ggml_type.FP32
|
||||
|
||||
gate = kt_kernel_ext.gate.MoEGate(config)
|
||||
gate = kt_kernel_ext.gate.MoEGate(config)
|
||||
|
||||
expert_ids = torch.zeros((seqlen, num_experts_per_token), dtype=torch.int64).to("cpu").contiguous()
|
||||
expert_weights = torch.zeros((seqlen, num_experts_per_token), dtype=torch.float32).to("cpu").contiguous()
|
||||
|
||||
|
||||
expert_ids = torch.zeros((seqlen, num_experts_per_token), dtype=torch.int64).to('cpu').contiguous()
|
||||
expert_weights = torch.zeros((seqlen, num_experts_per_token), dtype=torch.float32).to('cpu').contiguous()
|
||||
|
||||
gate.forward(seqlen,hidden_states.data_ptr(),expert_ids.data_ptr(), expert_weights.data_ptr())
|
||||
gate.forward(seqlen, hidden_states.data_ptr(), expert_ids.data_ptr(), expert_weights.data_ptr())
|
||||
|
||||
# print(expert_ids,expert_weights)
|
||||
return expert_ids, expert_weights
|
||||
|
||||
input = torch.randn(seqlen, hidden_size, dtype=torch.float32).to('cpu').contiguous()
|
||||
|
||||
input = torch.randn(seqlen, hidden_size, dtype=torch.float32).to("cpu").contiguous()
|
||||
# print(input)
|
||||
ids,we = cpuinfer_gate(input)
|
||||
ids, we = cpuinfer_gate(input)
|
||||
idx = torch.argsort(ids, dim=-1, descending=True)
|
||||
ids = torch.gather(ids,dim=-1,index=idx)
|
||||
we = torch.gather(we,dim=-1,index=idx)
|
||||
ids = torch.gather(ids, dim=-1, index=idx)
|
||||
we = torch.gather(we, dim=-1, index=idx)
|
||||
|
||||
|
||||
|
||||
std_ids,std_we= torch_gate(input)
|
||||
std_ids, std_we = torch_gate(input)
|
||||
idx = torch.argsort(std_ids, dim=-1, descending=True)
|
||||
std_we = torch.gather(std_we,dim=-1,index=idx)
|
||||
std_ids = torch.gather(std_ids,dim=-1,index=idx)
|
||||
|
||||
std_we = torch.gather(std_we, dim=-1, index=idx)
|
||||
std_ids = torch.gather(std_ids, dim=-1, index=idx)
|
||||
|
||||
|
||||
# print("ids diff:", torch.abs(std_ids - ids).max())
|
||||
@@ -221,28 +214,3 @@ std_ids = torch.gather(std_ids,dim=-1,index=idx)
|
||||
assert torch.abs(std_ids - ids).max() == 0, "Expert IDs do not match!"
|
||||
assert torch.abs(std_we - we).max() < 1e-2, "Expert Weights do not match!"
|
||||
print("Expert IDs and Weights match successfully!")
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -6,7 +6,7 @@ from typing import Dict, Literal
|
||||
sys.path.insert(0, os.path.dirname(__file__) + "/../build")
|
||||
|
||||
import torch
|
||||
import kt_kernel_ext
|
||||
from kt_kernel import kt_kernel_ext
|
||||
|
||||
torch.manual_seed(42)
|
||||
|
||||
@@ -132,6 +132,7 @@ def pack_to_int32(value: torch.Tensor, num_bits: int, packed_dim: Literal[0, 1]
|
||||
|
||||
return packed
|
||||
|
||||
|
||||
def pack_tensor_per_row(q: torch.Tensor, num_bits: int) -> torch.Tensor:
|
||||
e, rows, cols = q.shape
|
||||
flat = q.view(e * rows, cols)
|
||||
@@ -283,9 +284,9 @@ def run_case(pattern: str) -> Dict[str, float]:
|
||||
CPUInfer.sync()
|
||||
|
||||
input_tensor_fp16 = input_tensor.to(torch.float16)
|
||||
t_output = moe_torch(
|
||||
input_tensor_fp16, expert_ids, weights, gate_fp16, up_fp16, down_fp16
|
||||
).to(torch.bfloat16)
|
||||
t_output = moe_torch(input_tensor_fp16, expert_ids, weights, gate_fp16, up_fp16, down_fp16).to(
|
||||
torch.bfloat16
|
||||
)
|
||||
|
||||
t_output = t_output.flatten()
|
||||
output = output.flatten()
|
||||
|
||||
@@ -11,7 +11,7 @@ import numpy as np
|
||||
# if REPO_ROOT not in sys.path:
|
||||
# sys.path.insert(0, REPO_ROOT)
|
||||
|
||||
import kt_kernel_ext
|
||||
from kt_kernel import kt_kernel_ext
|
||||
from kt_kernel_ext import CPUInfer
|
||||
|
||||
|
||||
@@ -57,10 +57,10 @@ def allocate_weights(expert_num, hidden_size, intermediate_size, group_size):
|
||||
def main():
|
||||
torch.manual_seed(123)
|
||||
|
||||
expert_num = 256 # Total experts
|
||||
expert_num = 256 # Total experts
|
||||
gpu_experts = expert_num # Number of experts on GPU
|
||||
gpu_tp_count = 2 # Number of TP parts
|
||||
|
||||
|
||||
num_experts_per_tok = 8
|
||||
hidden_size = 7168
|
||||
intermediate_size = 2048
|
||||
@@ -89,9 +89,7 @@ def main():
|
||||
|
||||
moe = kt_kernel_ext.moe.AMXInt4_KGroup_MOE(cfg)
|
||||
|
||||
physical_to_logical_map = (
|
||||
torch.arange(expert_num, dtype=torch.int64, device="cpu").contiguous()
|
||||
)
|
||||
physical_to_logical_map = torch.arange(expert_num, dtype=torch.int64, device="cpu").contiguous()
|
||||
cpuinfer.submit(moe.load_weights_task(physical_to_logical_map.data_ptr()))
|
||||
cpuinfer.sync()
|
||||
|
||||
@@ -169,6 +167,7 @@ def main():
|
||||
total_bytes = total_weights // group_size + total_weights // 2
|
||||
print(f"write_weight_scale_to_buffer time: {elapsed_ms:.2f} ms")
|
||||
print(f"Throughput: {total_bytes / (elapsed_ms * 1e6):.2f} GB/s")
|
||||
|
||||
def split_expert_tensor(tensor, chunk):
|
||||
"""Split tensor by experts"""
|
||||
return [tensor[i * chunk : (i + 1) * chunk] for i in range(expert_num)]
|
||||
@@ -229,10 +228,10 @@ def main():
|
||||
tp_scale_offset = col_scale_start + tp_idx * tp_slice_scale_size
|
||||
|
||||
down_weight_tp_parts.append(
|
||||
down_q_experts[expert_idx][tp_weight_offset:tp_weight_offset + tp_slice_weight_size]
|
||||
down_q_experts[expert_idx][tp_weight_offset : tp_weight_offset + tp_slice_weight_size]
|
||||
)
|
||||
down_scale_tp_parts.append(
|
||||
down_scale_experts[expert_idx][tp_scale_offset:tp_scale_offset + tp_slice_scale_size]
|
||||
down_scale_experts[expert_idx][tp_scale_offset : tp_scale_offset + tp_slice_scale_size]
|
||||
)
|
||||
|
||||
# Concatenate all column slices for this TP
|
||||
@@ -260,7 +259,9 @@ def main():
|
||||
assert torch.equal(w2_weight_bufs[tp_idx], expected_w2_weight), f"w2 weight bytes mismatch for TP {tp_idx}"
|
||||
assert torch.allclose(w2_scale_bufs[tp_idx], expected_w2_scale), f"w2 scale values mismatch for TP {tp_idx}"
|
||||
|
||||
print(f"\n✓ write_weight_scale_to_buffer passed: extracted {gpu_experts} GPU experts across {gpu_tp_count} TP parts from total {expert_num} experts")
|
||||
print(
|
||||
f"\n✓ write_weight_scale_to_buffer passed: extracted {gpu_experts} GPU experts across {gpu_tp_count} TP parts from total {expert_num} experts"
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@@ -1,26 +1,27 @@
|
||||
#!/usr/bin/env python
|
||||
# coding=utf-8
|
||||
'''
|
||||
Description :
|
||||
"""
|
||||
Description :
|
||||
Author : chenht2022
|
||||
Date : 2024-07-25 10:32:05
|
||||
Version : 1.0.0
|
||||
LastEditors : chenht2022
|
||||
LastEditors : chenht2022
|
||||
LastEditTime : 2024-08-06 10:36:59
|
||||
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
|
||||
'''
|
||||
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
|
||||
"""
|
||||
import os, sys
|
||||
import time
|
||||
sys.path.append(os.path.dirname(__file__) + '/../build')
|
||||
import kt_kernel_ext
|
||||
|
||||
sys.path.append(os.path.dirname(__file__) + "/../build")
|
||||
from kt_kernel import kt_kernel_ext
|
||||
import torch
|
||||
|
||||
input_size = 16384
|
||||
output_size = 5120
|
||||
stride = 32
|
||||
group_max_len = 1024
|
||||
proj_type = 1 # ggml_type::GGML_TYPE_F16
|
||||
hidden_type = 1 # ggml_type::GGML_TYPE_F16
|
||||
proj_type = 1 # ggml_type::GGML_TYPE_F16
|
||||
hidden_type = 1 # ggml_type::GGML_TYPE_F16
|
||||
qlen = 30
|
||||
layer_num = 10
|
||||
CPUInfer = kt_kernel_ext.CPUInfer(48)
|
||||
@@ -30,8 +31,10 @@ with torch.inference_mode(mode=True):
|
||||
linears = []
|
||||
projs = []
|
||||
for _ in range(layer_num):
|
||||
proj = torch.randn((output_size, input_size), dtype=torch.float16, device = "cuda").to("cpu").contiguous()
|
||||
config = kt_kernel_ext.linear.LinearConfig(input_size, output_size, stride, group_max_len, proj.data_ptr(), proj_type, hidden_type)
|
||||
proj = torch.randn((output_size, input_size), dtype=torch.float16, device="cuda").to("cpu").contiguous()
|
||||
config = kt_kernel_ext.linear.LinearConfig(
|
||||
input_size, output_size, stride, group_max_len, proj.data_ptr(), proj_type, hidden_type
|
||||
)
|
||||
linear = kt_kernel_ext.linear.Linear(config)
|
||||
projs.append(proj)
|
||||
linears.append(linear)
|
||||
@@ -43,20 +46,14 @@ with torch.inference_mode(mode=True):
|
||||
output = torch.empty((qlen, output_size), dtype=torch.float16).contiguous()
|
||||
input = input / 100
|
||||
|
||||
CPUInfer.submit(
|
||||
linear.forward(
|
||||
qlen,
|
||||
input.data_ptr(),
|
||||
output.data_ptr()
|
||||
)
|
||||
)
|
||||
CPUInfer.submit(linear.forward(qlen, input.data_ptr(), output.data_ptr()))
|
||||
CPUInfer.sync()
|
||||
# print('cpuinfer output', output)
|
||||
|
||||
proj = projs[i%layer_num]
|
||||
proj = projs[i % layer_num]
|
||||
t_output = torch.mm(input, proj.t())
|
||||
# print('torch output', t_output)
|
||||
|
||||
diff = torch.mean(torch.abs(output - t_output)) / torch.mean(torch.abs(t_output))
|
||||
print('diff = ', diff)
|
||||
assert(diff < 0.001)
|
||||
print("diff = ", diff)
|
||||
assert diff < 0.001
|
||||
|
||||
@@ -1,19 +1,22 @@
|
||||
import logging
|
||||
import os,sys
|
||||
import os, sys
|
||||
import time
|
||||
from typing import Optional
|
||||
|
||||
os.environ["BLAS_NUM_THREADS"] = "1"
|
||||
sys.path.insert(0, os.path.dirname(__file__) + '/../build')
|
||||
import kt_kernel_ext
|
||||
sys.path.insert(0, os.path.dirname(__file__) + "/../build")
|
||||
from kt_kernel import kt_kernel_ext
|
||||
from kt_kernel_ext.kvcache import ggml_type
|
||||
import torch
|
||||
from torch import inf, nn
|
||||
from torch.nn import init
|
||||
from torch_attention import apply_rotary_pos_emb,DeepseekV2RMSNorm,KDeepSeekV3Cache,DeepseekV3YarnRotaryEmbedding
|
||||
from torch_attention import apply_rotary_pos_emb, DeepseekV2RMSNorm, KDeepSeekV3Cache, DeepseekV3YarnRotaryEmbedding
|
||||
|
||||
logger = logging.getLogger("reader")
|
||||
|
||||
from gguf.gguf_reader import GGUFReader
|
||||
|
||||
|
||||
def read_gguf_file(gguf_file_path):
|
||||
"""
|
||||
Reads and prints key-value pairs and tensor information from a GGUF file in an improved format.
|
||||
@@ -46,12 +49,15 @@ def read_gguf_file(gguf_file_path):
|
||||
re.append(tensor)
|
||||
return re
|
||||
|
||||
|
||||
def get_torch_tensor_from_gguf(gguf_weights, name):
|
||||
return torch.from_numpy(gguf_weights[name].data).contiguous()
|
||||
|
||||
|
||||
def get_torch_tensor_and_type_from_gguf(gguf_weights, name):
|
||||
return torch.from_numpy(gguf_weights[name].data).contiguous(), gguf_weights[name].tensor_type.name
|
||||
|
||||
|
||||
def type_to_ggml_type(type):
|
||||
if type == "F32":
|
||||
return ggml_type.FP32
|
||||
@@ -70,12 +76,12 @@ seed = 42 # 你可以选择任何整数作为种子
|
||||
torch.manual_seed(seed)
|
||||
torch.cuda.manual_seed_all(seed)
|
||||
|
||||
qlen = 3212
|
||||
qlen = 3212
|
||||
kvlen = 0
|
||||
|
||||
|
||||
page_table = range(20)
|
||||
bsz_tensors=torch.tensor([1])
|
||||
bsz_tensors = torch.tensor([1])
|
||||
|
||||
|
||||
page_size = 256
|
||||
@@ -94,8 +100,7 @@ rope_theta = 10000
|
||||
max_qlen = 4096
|
||||
max_kvlen = 4096
|
||||
|
||||
max_position_embeddings = 163840
|
||||
|
||||
max_position_embeddings = 163840
|
||||
|
||||
|
||||
rope_scaling = {
|
||||
@@ -105,11 +110,10 @@ rope_scaling = {
|
||||
"mscale": 1.0,
|
||||
"mscale_all_dim": 1.0,
|
||||
"original_max_position_embeddings": 4096,
|
||||
"type": "yarn"
|
||||
"type": "yarn",
|
||||
}
|
||||
|
||||
|
||||
|
||||
CPUInfer = kt_kernel_ext.CPUInfer(30)
|
||||
validation_iter = 100
|
||||
|
||||
@@ -119,15 +123,16 @@ weight_type = torch.bfloat16
|
||||
# weight_type = torch.float16
|
||||
|
||||
|
||||
input_type = {torch.float32:torch.float32,
|
||||
torch.float16:torch.float16,
|
||||
torch.bfloat16:torch.float32,
|
||||
}[weight_type]
|
||||
input_type = {
|
||||
torch.float32: torch.float32,
|
||||
torch.float16: torch.float16,
|
||||
torch.bfloat16: torch.float32,
|
||||
}[weight_type]
|
||||
|
||||
q_a_proj = nn.Linear(hidden_size, q_lora_rank, bias=False, dtype=weight_type)
|
||||
q_b_proj = nn.Linear(q_lora_rank, num_heads * (nope_size+rope_size) , bias=False, dtype=weight_type)
|
||||
q_b_proj = nn.Linear(q_lora_rank, num_heads * (nope_size + rope_size), bias=False, dtype=weight_type)
|
||||
kv_a_proj_with_mqa = nn.Linear(hidden_size, kv_lora_rank + rope_size, bias=False, dtype=weight_type)
|
||||
kv_b_proj = nn.Linear( num_heads * (nope_size + nope_size),kv_lora_rank, bias=False, dtype=weight_type)
|
||||
kv_b_proj = nn.Linear(num_heads * (nope_size + nope_size), kv_lora_rank, bias=False, dtype=weight_type)
|
||||
o_proj = nn.Linear(num_heads * nope_size, hidden_size, bias=False, dtype=weight_type)
|
||||
q_a_norm = torch.ones(hidden_size, dtype=torch.float32)
|
||||
kv_a_norm = torch.ones(hidden_size, dtype=torch.float32)
|
||||
@@ -190,7 +195,7 @@ if use_real_weights := True:
|
||||
|
||||
o_proj_weight, type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.attn_output.weight")
|
||||
o_proj.weight = nn.Parameter(o_proj_weight.view(torch.bfloat16), requires_grad=False)
|
||||
|
||||
|
||||
else:
|
||||
init.normal_(q_a_proj.weight, mean=0.0, std=0.02)
|
||||
init.normal_(q_b_proj.weight, mean=0.0, std=0.02)
|
||||
@@ -203,16 +208,16 @@ q_absorb = x_reshaped[:, 0]
|
||||
out_absorb = x_reshaped[:, 1]
|
||||
|
||||
|
||||
hidden_states = torch.randn((qlen, hidden_size), dtype=input_type).to('cpu').contiguous()
|
||||
hidden_states = torch.randn((qlen, hidden_size), dtype=input_type).to("cpu").contiguous()
|
||||
|
||||
|
||||
def test_cpu_mla():
|
||||
os.environ["BLAS_NUM_THREADS"] = "1"
|
||||
q_a_proj_weight = q_a_proj.weight.to(weight_type).to('cpu').contiguous()
|
||||
q_b_proj_weight = q_b_proj.weight.to(weight_type).to('cpu').contiguous()
|
||||
kv_a_proj_with_mqa_weight = kv_a_proj_with_mqa.weight.to('cpu').to(weight_type).contiguous()
|
||||
kv_b_proj_weight = kv_b_proj.weight.to(weight_type).to('cpu').contiguous()
|
||||
o_proj_weight = o_proj.weight.to(weight_type).to('cpu').contiguous()
|
||||
q_a_proj_weight = q_a_proj.weight.to(weight_type).to("cpu").contiguous()
|
||||
q_b_proj_weight = q_b_proj.weight.to(weight_type).to("cpu").contiguous()
|
||||
kv_a_proj_with_mqa_weight = kv_a_proj_with_mqa.weight.to("cpu").to(weight_type).contiguous()
|
||||
kv_b_proj_weight = kv_b_proj.weight.to(weight_type).to("cpu").contiguous()
|
||||
o_proj_weight = o_proj.weight.to(weight_type).to("cpu").contiguous()
|
||||
|
||||
config = kt_kernel_ext.mla.MLAConfig(
|
||||
hidden_size,
|
||||
@@ -224,7 +229,7 @@ def test_cpu_mla():
|
||||
)
|
||||
config.max_qlen = max_qlen
|
||||
config.max_kvlen = max_kvlen
|
||||
config.max_position_embeddings = max_position_embeddings
|
||||
config.max_position_embeddings = max_position_embeddings
|
||||
config.rope_scaling_factor = rope_scaling["factor"]
|
||||
config.rope_theta = rope_theta
|
||||
config.rope_scaling_beta_fast = rope_scaling["beta_fast"]
|
||||
@@ -245,7 +250,6 @@ def test_cpu_mla():
|
||||
config.kv_a_norm_type = ggml_type.FP32
|
||||
config.page_count = pages_count
|
||||
|
||||
|
||||
if weight_type == torch.float32:
|
||||
config.q_a_proj_type = ggml_type.FP32
|
||||
config.q_b_proj_type = ggml_type.FP32
|
||||
@@ -267,10 +271,8 @@ def test_cpu_mla():
|
||||
else:
|
||||
raise ValueError(f"Unsupported data type: {weight_type}")
|
||||
|
||||
|
||||
config.pool = CPUInfer.backend_
|
||||
|
||||
|
||||
if weight_type == torch.float32:
|
||||
mla = kt_kernel_ext.mla.MLA_F32(config)
|
||||
elif weight_type == torch.float16:
|
||||
@@ -280,54 +282,53 @@ def test_cpu_mla():
|
||||
mla = kt_kernel_ext.mla.MLA_QUAN_F32(config)
|
||||
else:
|
||||
raise ValueError(f"Unsupported data type: {weight_type}")
|
||||
|
||||
|
||||
mla.load_weights()
|
||||
mla.set_local_pages(pages_count)
|
||||
|
||||
output = torch.zeros((qlen, hidden_size), dtype=input_type).to('cpu').contiguous()
|
||||
mla.forward([qlen],[page_table],[kvlen],hidden_states.data_ptr(),output.data_ptr())
|
||||
print("CPU MLA Output: ",output)
|
||||
output = torch.zeros((qlen, hidden_size), dtype=input_type).to("cpu").contiguous()
|
||||
mla.forward([qlen], [page_table], [kvlen], hidden_states.data_ptr(), output.data_ptr())
|
||||
print("CPU MLA Output: ", output)
|
||||
return output
|
||||
|
||||
|
||||
|
||||
|
||||
def load_fp16_tensor(file_path, shape):
|
||||
# return load_fp32_tensor(file_path, shape)
|
||||
return torch.zeros(shape)
|
||||
with open(file_path, 'rb') as f:
|
||||
with open(file_path, "rb") as f:
|
||||
raw_data = f.read()
|
||||
tensor = torch.frombuffer(raw_data, dtype=weight_type)
|
||||
tensor = tensor.view(shape) # 根据你的 shape reshape
|
||||
return tensor
|
||||
|
||||
|
||||
def load_fp32_tensor(file_path, shape):
|
||||
return torch.zeros(shape)
|
||||
with open(file_path, 'rb') as f:
|
||||
with open(file_path, "rb") as f:
|
||||
raw_data = f.read()
|
||||
tensor = torch.frombuffer(raw_data, dtype=torch.float32)
|
||||
tensor = tensor.view(shape) # 根据你的 shape reshape
|
||||
return tensor
|
||||
|
||||
|
||||
def test_torch():
|
||||
torch.set_grad_enabled(False)
|
||||
|
||||
softmax_scale = (nope_size + rope_size) ** -0.5
|
||||
# 1代表的是压缩的kv的头数
|
||||
k_caches = torch.randn(1,pages_count, page_size, 1, kv_lora_rank + rope_size).to(weight_type)
|
||||
k_caches = torch.randn(1, pages_count, page_size, 1, kv_lora_rank + rope_size).to(weight_type)
|
||||
kv_cache = KDeepSeekV3Cache(page_size=page_size, kv_lora_rank=kv_lora_rank, k_caches=k_caches)
|
||||
|
||||
q_a_layernorm = DeepseekV2RMSNorm(q_lora_rank)
|
||||
q_a_layernorm.weight = nn.Parameter( q_a_norm,requires_grad=False)
|
||||
q_a_layernorm.weight = nn.Parameter(q_a_norm, requires_grad=False)
|
||||
|
||||
x = torch.randn(q_lora_rank, dtype=weight_type)*100
|
||||
x = torch.randn(q_lora_rank, dtype=weight_type) * 100
|
||||
print(x)
|
||||
print(q_a_layernorm(x))
|
||||
|
||||
kv_a_layernorm = DeepseekV2RMSNorm(kv_lora_rank)
|
||||
kv_a_layernorm.weight = nn.Parameter(kv_a_norm, requires_grad=False)
|
||||
|
||||
|
||||
# 第三步:拆分成两个 tensor
|
||||
# q_absorb, out_absorb = x_permuted[:, 0], x_permuted[:, 1] # 都是 (num_heads, nope_size, kv_lora_rank
|
||||
# q_absorb = kv_b_proj[:, ] # torch.randn(num_heads, nope_size, kv_lora_rank, dtype=data_type)
|
||||
@@ -348,65 +349,64 @@ def test_torch():
|
||||
# kv_indices 是[0:bsz],page_idx=[0:bsz], page_offset=[kvlen:qlen+kvlen]
|
||||
# last_page_len = [qlen+kvlen,...] layer_idx = 1
|
||||
# position_ids = [kvlen:qlen+kvlen]
|
||||
q_indptr = torch.tensor([0,qlen]).to(torch.int32)
|
||||
q_indptr = torch.tensor([0, qlen]).to(torch.int32)
|
||||
|
||||
kv_indptr = torch.tensor([0,(qlen+kvlen+page_size-1)//page_size]).to(torch.int32)
|
||||
kv_indptr = torch.tensor([0, (qlen + kvlen + page_size - 1) // page_size]).to(torch.int32)
|
||||
kv_indices = torch.tensor(range(pages_count)).to(torch.int32)
|
||||
|
||||
page_idx = torch.tensor([i//page_size for i in range(kvlen,kvlen+qlen)] ).to(torch.int32)
|
||||
page_offset = torch.tensor( [i%page_size for i in range(kvlen, kvlen + qlen)]).to(torch.int32)
|
||||
page_idx = torch.tensor([i // page_size for i in range(kvlen, kvlen + qlen)]).to(torch.int32)
|
||||
page_offset = torch.tensor([i % page_size for i in range(kvlen, kvlen + qlen)]).to(torch.int32)
|
||||
|
||||
last_page_len = torch.tensor([256], device=hidden_states.device)
|
||||
position_ids = torch.tensor(range(kvlen, kvlen + qlen)).to(torch.int32)
|
||||
|
||||
|
||||
# 按照行创建 mask [qlen,kvlen+qlen]
|
||||
attention_masks = torch.zeros((max_qlen, max_kvlen), dtype=weight_type)
|
||||
for i in range(max_qlen):
|
||||
attention_masks[i, i + kvlen + 1:] = -inf
|
||||
attention_masks[i, i + kvlen + 1 :] = -inf
|
||||
|
||||
|
||||
def torch_attn(hidden_states_i: torch.Tensor,
|
||||
kv_cache: KDeepSeekV3Cache,
|
||||
position_ids: torch.Tensor,
|
||||
page_idx: torch.Tensor,
|
||||
page_offset: torch.Tensor,
|
||||
attention_masks: Optional[list[torch.Tensor]] = None,
|
||||
q_indptr: Optional[torch.Tensor] = None,
|
||||
kv_indices: Optional[torch.Tensor] = None,
|
||||
kv_indptr: Optional[torch.Tensor] = None,
|
||||
bsz_tensors: Optional[torch.Tensor] = None,
|
||||
last_page_len: Optional[torch.Tensor] = None,
|
||||
layer_idx: Optional[int] = None,
|
||||
):
|
||||
def torch_attn(
|
||||
hidden_states_i: torch.Tensor,
|
||||
kv_cache: KDeepSeekV3Cache,
|
||||
position_ids: torch.Tensor,
|
||||
page_idx: torch.Tensor,
|
||||
page_offset: torch.Tensor,
|
||||
attention_masks: Optional[list[torch.Tensor]] = None,
|
||||
q_indptr: Optional[torch.Tensor] = None,
|
||||
kv_indices: Optional[torch.Tensor] = None,
|
||||
kv_indptr: Optional[torch.Tensor] = None,
|
||||
bsz_tensors: Optional[torch.Tensor] = None,
|
||||
last_page_len: Optional[torch.Tensor] = None,
|
||||
layer_idx: Optional[int] = None,
|
||||
):
|
||||
global out_absorb
|
||||
global q_absorb
|
||||
hidden_states = hidden_states_i.to(weight_type)
|
||||
# range bsz_tensors
|
||||
final_attention_output = torch.tensor([], device=hidden_states.device)
|
||||
for i in range(bsz_tensors[0]):
|
||||
batch_num_tokens_tensors = q_indptr[i+1] - q_indptr[i]
|
||||
batch_num_tokens_tensors = q_indptr[i + 1] - q_indptr[i]
|
||||
batch_last_page_len = last_page_len[i]
|
||||
# kv_total_len is kv_len, batch_compressed_kv is compressed_kv, batch_k_pe is k_pe
|
||||
batch_page_idx = page_idx[q_indptr[i]:q_indptr[i+1]]
|
||||
batch_page_offset = page_offset[q_indptr[i]:q_indptr[i+1]]
|
||||
batch_page_idx = page_idx[q_indptr[i] : q_indptr[i + 1]]
|
||||
batch_page_offset = page_offset[q_indptr[i] : q_indptr[i + 1]]
|
||||
# kv_page_nums is the number of pages for the current batch
|
||||
kv_page_nums = kv_indptr[i+1] - kv_indptr[i]
|
||||
kv_page_nums = kv_indptr[i + 1] - kv_indptr[i]
|
||||
# kv_total_len is the total length of the kv cache for the current batch (kv_len for algorithm)
|
||||
kv_total_len = kv_page_nums * page_size
|
||||
if batch_last_page_len is not None:
|
||||
kv_total_len = kv_total_len - (page_size - batch_last_page_len)
|
||||
# print(f"kv_total_len's shape {kv_total_len.shape}")
|
||||
# kv_index is the index of the kv cache pages for the current batch
|
||||
kv_index = kv_indices[kv_indptr[i]:kv_indptr[i+1]]
|
||||
kv_index = kv_indices[kv_indptr[i] : kv_indptr[i + 1]]
|
||||
# we can index [kv_index, page_offset_indices] to get the kv cache for the current batch
|
||||
# from q_indptr[i] to q_indptr[i+1] is the range of the current batch
|
||||
batch_hidden_states = hidden_states[q_indptr[i]:q_indptr[i+1]]
|
||||
batch_position_ids = position_ids[q_indptr[i]:q_indptr[i+1]]
|
||||
batch_hidden_states = hidden_states[q_indptr[i] : q_indptr[i + 1]]
|
||||
batch_position_ids = position_ids[q_indptr[i] : q_indptr[i + 1]]
|
||||
qlen, _ = batch_hidden_states.size()
|
||||
# print("qlen -> ", qlen)
|
||||
|
||||
hidden_states_to_check = load_fp16_tensor('./debug/query_0_tp_0_input.bin',batch_hidden_states.shape)
|
||||
hidden_states_to_check = load_fp16_tensor("./debug/query_0_tp_0_input.bin", batch_hidden_states.shape)
|
||||
diff = torch.abs(batch_hidden_states - hidden_states_to_check).max()
|
||||
print("hidden_states diff -> ", diff)
|
||||
|
||||
@@ -422,8 +422,6 @@ def test_torch():
|
||||
# print("q_lora mae -> ", mae)
|
||||
# print("q_lora mae test -> ", mae_test)
|
||||
|
||||
|
||||
|
||||
q_lora_norm = q_a_layernorm(q_lora)
|
||||
# q_lora_norm_to_check = load_fp16_tensor('./debug/query_0_tp_0_qlora_norm.bin', q_lora_norm.shape)
|
||||
# q_lora_norm_to_check_test = load_fp16_tensor('./debug/query_0_tp_0_qlora_norm_test.bin', q_lora_norm.shape)
|
||||
@@ -435,30 +433,25 @@ def test_torch():
|
||||
# print("q_lora_norm mae -> ", mae)
|
||||
# print("q_lora_norm diff test -> ", diff_test)
|
||||
# print("q_lora_norm mae test -> ", mae_test)
|
||||
|
||||
|
||||
q = q_b_proj(q_lora_norm)
|
||||
# for v3, bsz, qlen, num_heads(128), qk_head_dim(192=128(nope)+64(rope))
|
||||
q = q.view(qlen, num_heads, nope_size+rope_size)
|
||||
q = q.view(qlen, num_heads, nope_size + rope_size)
|
||||
# q_nope is [qlen, num_heads(128), qk_nope_head_dim(128)]
|
||||
# q_pe is [qlen, num_heads(128), qk_rope_head_dim(64)]
|
||||
q_nope, q_pe = torch.split(
|
||||
q, [nope_size, rope_size], dim=-1
|
||||
)
|
||||
|
||||
q_nope, q_pe = torch.split(q, [nope_size, rope_size], dim=-1)
|
||||
|
||||
# compressed_kv is [qlen, kv_lora_rank(512) + rope(64)]
|
||||
compressed_kv = kv_a_proj_with_mqa(batch_hidden_states)
|
||||
# compressed_kv is [qlen, kv_lora_rank(512)], k_pe is [qlen, rope(64)]
|
||||
compressed_kv, k_pe = torch.split(
|
||||
compressed_kv, [kv_lora_rank, rope_size], dim=-1
|
||||
)
|
||||
compressed_kv, k_pe = torch.split(compressed_kv, [kv_lora_rank, rope_size], dim=-1)
|
||||
compressed_kv = compressed_kv.contiguous()
|
||||
|
||||
|
||||
# compressed_kv_page_0 = compressed_kv[0:page_size, :]
|
||||
# compressed_kv_to_check = load_fp16_tensor('./debug/query_0_tp_0_page_0_kv_lora_rank',
|
||||
# compressed_kv_page_0.shape)
|
||||
# diff = torch.abs(compressed_kv_page_0 - compressed_kv_to_check).max()
|
||||
# mae = torch.mean(torch.abs(compressed_kv_page_0 - compressed_kv_to_check))
|
||||
# mae = torch.mean(torch.abs(compressed_kv_page_0 - compressed_kv_to_check))
|
||||
# print("compressed_kv diff -> ", diff)
|
||||
# print("compressed_kv mae -> ", mae)
|
||||
|
||||
@@ -472,14 +465,11 @@ def test_torch():
|
||||
# mae = torch.mean(torch.abs(compressed_kv_page_0 - compressed_kv_to_check))
|
||||
# print("compressed_kv diff norm -> ", diff)
|
||||
# print("compressed_kv mae norm -> ", mae)
|
||||
|
||||
|
||||
|
||||
|
||||
k_pe = k_pe.view(qlen, 1, rope_size)
|
||||
# compressed_kv is [qlen, 1, kv_lora_rank(512)]
|
||||
compressed_kv = compressed_kv.view(qlen, 1, kv_lora_rank)
|
||||
|
||||
|
||||
cos, sin = rotary_emb(q_pe, batch_position_ids)
|
||||
|
||||
# q_nope_check = q_nope.transpose(0, 1) # qlen is 1, no GPU overhead, same below
|
||||
@@ -494,8 +484,8 @@ def test_torch():
|
||||
# print("q_nope[0] mae -> ", mae)
|
||||
# print("q_nope[0] diff test -> ", diff_test)
|
||||
# print("q_nope[0] mae test -> ", mae_test)
|
||||
|
||||
q_pe_nope = q_pe.transpose(0,1)
|
||||
|
||||
q_pe_nope = q_pe.transpose(0, 1)
|
||||
# q_pe_0_to_check = load_fp16_tensor('./debug/query_0_tp_0_q_rope', q_pe_nope[0].shape)
|
||||
# q_pe_0_to_check = load_fp16_tensor('./debug/query_0_tp_0_q_rope_no_rope', q_pe_nope[0].shape)
|
||||
# q_pe_0_to_check_test = load_fp16_tensor('./debug/query_0_tp_0_q_rope_no_rope_test', q_pe_nope[0].shape)
|
||||
@@ -534,12 +524,11 @@ def test_torch():
|
||||
q_pe, k_pe = apply_rotary_pos_emb(q_pe.unsqueeze(0), k_pe.unsqueeze(0), cos, sin, unsqueeze_dim=1)
|
||||
q_pe = q_pe.squeeze(0)
|
||||
# q_pe is [num_heads(128), qlen, qk_rope_head_dim(64)]
|
||||
q_pe.transpose_(0, 1)
|
||||
q_pe.transpose_(0, 1)
|
||||
|
||||
# diff = torch.abs(q_pe - q_new).max()
|
||||
# print("q_pe diff -> ", diff)
|
||||
|
||||
|
||||
# q_pe_0_to_check = load_fp16_tensor('./debug/query_0_tp_0_q_rope', q_pe[0].shape)
|
||||
# diff = torch.abs(q_pe[0] - q_pe_0_to_check).max()
|
||||
# mae = torch.mean(torch.abs(q_pe[0] - q_pe_0_to_check))
|
||||
@@ -552,15 +541,22 @@ def test_torch():
|
||||
# print("q_pe[0] 2 mae -> ", mae)
|
||||
|
||||
if kv_cache is not None:
|
||||
cache_kwargs = {"sin": sin, "cos": cos, "page_idx": batch_page_idx, "page_offset": batch_page_offset} # Specific to RoPE models
|
||||
compressed_kv_with_k_pe = kv_cache.update(compressed_kv.unsqueeze(0), k_pe, layer_idx, batch_page_idx, batch_page_offset, cache_kwargs)
|
||||
compressed_kv = compressed_kv_with_k_pe [:, :, :, :kv_lora_rank].view(-1, page_size, kv_lora_rank)
|
||||
k_pe = compressed_kv_with_k_pe [:, :, :, kv_lora_rank:].view(-1, page_size, rope_size)
|
||||
cache_kwargs = {
|
||||
"sin": sin,
|
||||
"cos": cos,
|
||||
"page_idx": batch_page_idx,
|
||||
"page_offset": batch_page_offset,
|
||||
} # Specific to RoPE models
|
||||
compressed_kv_with_k_pe = kv_cache.update(
|
||||
compressed_kv.unsqueeze(0), k_pe, layer_idx, batch_page_idx, batch_page_offset, cache_kwargs
|
||||
)
|
||||
compressed_kv = compressed_kv_with_k_pe[:, :, :, :kv_lora_rank].view(-1, page_size, kv_lora_rank)
|
||||
k_pe = compressed_kv_with_k_pe[:, :, :, kv_lora_rank:].view(-1, page_size, rope_size)
|
||||
# q_absorb is [num_heads(128), qk_nope_head_dim(128), kv_lora_rank(512)]
|
||||
# out_absorb is [num_heads(128), kv_lora_rank(512), v_head_dim(128)] v_head_dim is also the nope dim
|
||||
# q_absorb, out_absorb = get_absorbed()
|
||||
# q_nope is [num_heads(128), qlen, qk_nope_head_dim(128)]
|
||||
q_nope = q_nope.transpose(0, 1) # qlen is 1, no GPU overhead, same below
|
||||
q_nope = q_nope.transpose(0, 1) # qlen is 1, no GPU overhead, same below
|
||||
|
||||
# q_nope_0_to_check = load_fp16_tensor('./debug/query_0_tp_0_q_nope', q_nope[0].shape)
|
||||
# diff = torch.abs(q_nope[0] - q_nope_0_to_check).max()
|
||||
@@ -568,7 +564,7 @@ def test_torch():
|
||||
# print("q_nope[0] diff -> ", diff)
|
||||
|
||||
# q_nope is [num_heads(128), qlen, kv_lora_rank(512)]
|
||||
q_nope = torch.matmul(q_nope, q_absorb) # batched MM
|
||||
q_nope = torch.matmul(q_nope, q_absorb) # batched MM
|
||||
|
||||
# k_b_proj_check = load_fp16_tensor('./debug/query_0_tp_0_k_b_lora', (nope_size,kv_lora_rank))
|
||||
# diff = torch.abs(q_absorb[0] - k_b_proj_check).max()
|
||||
@@ -594,7 +590,7 @@ def test_torch():
|
||||
if batch_compressed_kv is None or batch_k_pe is None:
|
||||
batch_compressed_kv = tmp_compressed_kv
|
||||
batch_k_pe = tmp_k_pe
|
||||
else:
|
||||
else:
|
||||
batch_compressed_kv = torch.cat((batch_compressed_kv, tmp_compressed_kv), dim=0)
|
||||
batch_k_pe = torch.cat((batch_k_pe, tmp_k_pe), dim=0)
|
||||
kv_total_len -= page_size
|
||||
@@ -604,28 +600,27 @@ def test_torch():
|
||||
if batch_compressed_kv is None or batch_k_pe is None:
|
||||
batch_compressed_kv = tmp_compressed_kv
|
||||
batch_k_pe = tmp_k_pe
|
||||
else:
|
||||
else:
|
||||
batch_compressed_kv = torch.cat((batch_compressed_kv, tmp_compressed_kv), dim=0)
|
||||
batch_k_pe = torch.cat((batch_k_pe, tmp_k_pe), dim=0)
|
||||
break
|
||||
# batch_compressed_kv is [kv_total_len(k_len), kv_lora_rank(512)]
|
||||
# batch_k_pe is [kv_total_len(k_len), qk_rope_head_dim(64)]
|
||||
|
||||
|
||||
# k_pe_to_check = load_fp16_tensor('./debug/query_0_tp_0_page_0_k_rope', (256,64))
|
||||
# diff = torch.abs(batch_k_pe[:256] - k_pe_to_check).max()
|
||||
# mae = torch.mean(torch.abs(batch_k_pe[:256] - k_pe_to_check))
|
||||
# print("k_pe diff -> ", diff)
|
||||
# print("k_pe mae -> ", mae)
|
||||
|
||||
pe_weights = torch.matmul(q_pe,batch_k_pe.mT)
|
||||
pe_weights = torch.matmul(q_pe, batch_k_pe.mT)
|
||||
kv_total_len = kv_page_nums * page_size
|
||||
# pe_weights_0 = load_fp16_tensor('./debug/query_0_tp_0_pe_attention_weights', (1024,4096))
|
||||
# pe_weights_0 = pe_weights_0[0:qlen, 0:kv_total_len]
|
||||
# diff = torch.abs(pe_weights[0] - pe_weights_0).max()
|
||||
# print("pe_weights[0] diff -> ", diff)
|
||||
|
||||
attention_weights = (pe_weights + torch.matmul(q_nope, batch_compressed_kv.mT))
|
||||
attention_weights = pe_weights + torch.matmul(q_nope, batch_compressed_kv.mT)
|
||||
|
||||
# raw_weights = load_fp16_tensor('./debug/query_0_tp_0_raw_attention_weights', (1024, 4096))
|
||||
# raw_weights = raw_weights[0:qlen, 0:kv_total_len]
|
||||
@@ -634,47 +629,47 @@ def test_torch():
|
||||
|
||||
attention_weights = attention_weights * softmax_scale
|
||||
# attention_weights is [num_heads(128), qlen, k_len]
|
||||
|
||||
|
||||
# attention_weights = attention_weights.transpose(0,1).unsqueeze(0).squeeze(-1).expand(qlen,-1,-1).transpose(0,1)
|
||||
|
||||
|
||||
# attention_masks[i] is [qlen, k_len]
|
||||
|
||||
|
||||
print(attention_weights.shape)
|
||||
print(attention_masks.shape)
|
||||
attention_weights = (attention_weights + attention_masks[ :attention_weights.shape[1],:attention_weights.shape[2]])
|
||||
attention_weights = (
|
||||
attention_weights + attention_masks[: attention_weights.shape[1], : attention_weights.shape[2]]
|
||||
)
|
||||
# attention_weights shape is [num_heads(128), qlen, k_len]
|
||||
|
||||
|
||||
attention_weights = nn.functional.softmax(attention_weights,dim=-1,dtype=weight_type).to(q_pe.dtype)
|
||||
attention_weights = nn.functional.softmax(attention_weights, dim=-1, dtype=weight_type).to(q_pe.dtype)
|
||||
|
||||
# attention_weights_0 = load_fp16_tensor('./debug/query_0_tp_0_attention_weights', (1024, 4096))
|
||||
# attention_weights_0 = attention_weights_0[0:qlen, 0:kv_total_len]
|
||||
# diff = torch.abs(attention_weights[0] - attention_weights_0).max()
|
||||
# print("attention_weights[0] diff -> ", diff)
|
||||
|
||||
|
||||
attn_output = torch.matmul(attention_weights, batch_compressed_kv) # [num_heads(128),qlen, lora_rank(512)]
|
||||
attn_output = torch.matmul(attention_weights, batch_compressed_kv) # [num_heads(128),qlen, lora_rank(512)]
|
||||
# out_absorb shape is [num_heads(128), kv_lora_rank(512), v_head_dim(128)]
|
||||
|
||||
# o_absorb_check = load_fp16_tensor('./debug/query_0_tp_0_o_absorb', (qlen,kv_lora_rank))
|
||||
# diff = torch.abs(attn_output[0] - o_absorb_check).max()
|
||||
# print("o absorb[0] diff -> ", diff)
|
||||
|
||||
out_absorb = out_absorb.transpose(1, 2) # [qlen, num_heads(128), v_head_dim(128)]
|
||||
out_absorb = out_absorb.transpose(1, 2) # [qlen, num_heads(128), v_head_dim(128)]
|
||||
# q for qlen, n for num_heads, h for v_head_dim, v for kv_lora_rank
|
||||
attn_output = torch.matmul(attn_output, out_absorb) # [num_heads(128), qlen, v_head_dim(128)]
|
||||
attn_output = torch.matmul(attn_output, out_absorb) # [num_heads(128), qlen, v_head_dim(128)]
|
||||
|
||||
# attn_output_check_0 = load_fp16_tensor('./debug/query_0_tp_0_attention_output', (qlen, nope_size))
|
||||
# diff = torch.abs(attn_output[0] - attn_output_check_0).max()
|
||||
# print("attn_output[0] diff -> ", diff)
|
||||
|
||||
attn_output = attn_output.transpose(0, 1) # [qlen, num_heads(128), v_head_dim(128)]
|
||||
attn_output = attn_output.transpose(0, 1) # [qlen, num_heads(128), v_head_dim(128)]
|
||||
attn_output = attn_output.reshape(qlen, num_heads * nope_size)
|
||||
|
||||
w_o = o_proj.weight.view([hidden_size,num_heads * nope_size])
|
||||
output = torch.matmul(attn_output,w_o.transpose(0,1))
|
||||
w_o = o_proj.weight.view([hidden_size, num_heads * nope_size])
|
||||
output = torch.matmul(attn_output, w_o.transpose(0, 1))
|
||||
output = output.view(qlen, hidden_size)
|
||||
|
||||
|
||||
# output_0_check = load_fp16_tensor('./debug/query_0_tp_0_qlen_output', (qlen, hidden_size))
|
||||
# h1_o = w_o[:,:128]
|
||||
# local_o_check = load_fp16_tensor('./debug/query_0_tp_0_local_w_o', (hidden_size, 128))
|
||||
@@ -685,35 +680,32 @@ def test_torch():
|
||||
# diff = torch.abs(h1_output - output_0_check).max()
|
||||
# print("h1_output diff -> ", diff)
|
||||
|
||||
|
||||
# output_check = load_fp16_tensor('./debug/output.bin', output.shape)
|
||||
# diff = torch.abs(output - output_check).max()
|
||||
# mae = torch.mean(torch.abs(output - output_check))
|
||||
# print("output diff -> ", diff)
|
||||
|
||||
|
||||
final_attention_output = torch.cat((final_attention_output, output), dim=0)
|
||||
return final_attention_output
|
||||
|
||||
|
||||
|
||||
torch_output = torch_attn(
|
||||
hidden_states,
|
||||
kv_cache,
|
||||
position_ids,
|
||||
page_idx,
|
||||
page_offset,
|
||||
attention_masks=attention_masks,
|
||||
q_indptr=q_indptr,
|
||||
kv_indices=kv_indices,
|
||||
kv_indptr=kv_indptr,
|
||||
bsz_tensors=bsz_tensors,
|
||||
last_page_len=last_page_len,
|
||||
layer_idx=0
|
||||
)
|
||||
print("Torch Output: ",torch_output)
|
||||
hidden_states,
|
||||
kv_cache,
|
||||
position_ids,
|
||||
page_idx,
|
||||
page_offset,
|
||||
attention_masks=attention_masks,
|
||||
q_indptr=q_indptr,
|
||||
kv_indices=kv_indices,
|
||||
kv_indptr=kv_indptr,
|
||||
bsz_tensors=bsz_tensors,
|
||||
last_page_len=last_page_len,
|
||||
layer_idx=0,
|
||||
)
|
||||
print("Torch Output: ", torch_output)
|
||||
return torch_output
|
||||
|
||||
|
||||
torch.set_printoptions(sci_mode=False, precision=5)
|
||||
output_cpu = test_cpu_mla()
|
||||
output_torch = test_torch()
|
||||
@@ -724,11 +716,9 @@ diff = (output_cpu - output_torch).abs()
|
||||
diff_relative = diff / (output_cpu.abs())
|
||||
# 把 diff_relative 中的 NaN 替换为 0
|
||||
diff_relative = torch.where(torch.isnan(diff_relative), torch.zeros_like(diff_relative), diff_relative)
|
||||
diff_relative_mean = torch.mean(torch.abs(output_cpu-output_torch)) / torch.mean(torch.abs(output_torch))
|
||||
diff_relative_mean = torch.mean(torch.abs(output_cpu - output_torch)) / torch.mean(torch.abs(output_torch))
|
||||
|
||||
print(f'Diff: ave:{diff.mean()}, max:{diff.max()}, min:{diff.min()}, relative_mean:{diff_relative_mean}, relative_max:{diff_relative.max()}, relative_min:{diff_relative.min()}')
|
||||
print(
|
||||
f"Diff: ave:{diff.mean()}, max:{diff.max()}, min:{diff.min()}, relative_mean:{diff_relative_mean}, relative_max:{diff_relative.max()}, relative_min:{diff_relative.min()}"
|
||||
)
|
||||
assert diff_relative_mean < 2e-1, "CPU and Torch outputs are not close enough!"
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -1,19 +1,22 @@
|
||||
import logging
|
||||
import os,sys
|
||||
import os, sys
|
||||
import time
|
||||
from typing import Optional
|
||||
|
||||
os.environ["BLAS_NUM_THREADS"] = "1"
|
||||
sys.path.insert(0, os.path.dirname(__file__) + '/../build')
|
||||
import kt_kernel_ext
|
||||
sys.path.insert(0, os.path.dirname(__file__) + "/../build")
|
||||
from kt_kernel import kt_kernel_ext
|
||||
from kt_kernel_ext.kvcache import ggml_type
|
||||
import torch
|
||||
from torch import inf, nn
|
||||
from torch.nn import init
|
||||
from torch_attention import apply_rotary_pos_emb,DeepseekV2RMSNorm,KDeepSeekV3Cache,DeepseekV3YarnRotaryEmbedding
|
||||
from torch_attention import apply_rotary_pos_emb, DeepseekV2RMSNorm, KDeepSeekV3Cache, DeepseekV3YarnRotaryEmbedding
|
||||
|
||||
logger = logging.getLogger("reader")
|
||||
|
||||
from gguf.gguf_reader import GGUFReader
|
||||
|
||||
|
||||
def read_gguf_file(gguf_file_path):
|
||||
"""
|
||||
Reads and prints key-value pairs and tensor information from a GGUF file in an improved format.
|
||||
@@ -46,12 +49,15 @@ def read_gguf_file(gguf_file_path):
|
||||
re.append(tensor)
|
||||
return re
|
||||
|
||||
|
||||
def get_torch_tensor_from_gguf(gguf_weights, name):
|
||||
return torch.from_numpy(gguf_weights[name].data).contiguous()
|
||||
|
||||
|
||||
def get_torch_tensor_and_type_from_gguf(gguf_weights, name):
|
||||
return torch.from_numpy(gguf_weights[name].data).contiguous(), gguf_weights[name].tensor_type.name
|
||||
|
||||
|
||||
def type_to_ggml_type(type):
|
||||
if type == "F32":
|
||||
return ggml_type.FP32
|
||||
@@ -75,7 +81,7 @@ kvlen = 0
|
||||
|
||||
|
||||
page_table = range(20)
|
||||
bsz_tensors=torch.tensor([1])
|
||||
bsz_tensors = torch.tensor([1])
|
||||
|
||||
|
||||
page_size = 256
|
||||
@@ -94,8 +100,7 @@ rope_theta = 10000
|
||||
max_qlen = 1024
|
||||
max_kvlen = 4096
|
||||
|
||||
max_position_embeddings = 163840
|
||||
|
||||
max_position_embeddings = 163840
|
||||
|
||||
|
||||
rope_scaling = {
|
||||
@@ -105,11 +110,10 @@ rope_scaling = {
|
||||
"mscale": 1.0,
|
||||
"mscale_all_dim": 1.0,
|
||||
"original_max_position_embeddings": 4096,
|
||||
"type": "yarn"
|
||||
"type": "yarn",
|
||||
}
|
||||
|
||||
|
||||
|
||||
CPUInfer = kt_kernel_ext.CPUInfer(64)
|
||||
validation_iter = 100
|
||||
|
||||
@@ -119,15 +123,16 @@ weight_type = torch.bfloat16
|
||||
# weight_type = torch.float16
|
||||
|
||||
|
||||
input_type = {torch.float32:torch.float32,
|
||||
torch.float16:torch.float16,
|
||||
torch.bfloat16:torch.float32,
|
||||
}[weight_type]
|
||||
input_type = {
|
||||
torch.float32: torch.float32,
|
||||
torch.float16: torch.float16,
|
||||
torch.bfloat16: torch.float32,
|
||||
}[weight_type]
|
||||
|
||||
q_a_proj = nn.Linear(hidden_size, q_lora_rank, bias=False, dtype=weight_type)
|
||||
q_b_proj = nn.Linear(q_lora_rank, num_heads * (nope_size+rope_size) , bias=False, dtype=weight_type)
|
||||
q_b_proj = nn.Linear(q_lora_rank, num_heads * (nope_size + rope_size), bias=False, dtype=weight_type)
|
||||
kv_a_proj_with_mqa = nn.Linear(hidden_size, kv_lora_rank + rope_size, bias=False, dtype=weight_type)
|
||||
kv_b_proj = nn.Linear( num_heads * (nope_size + nope_size),kv_lora_rank, bias=False, dtype=weight_type)
|
||||
kv_b_proj = nn.Linear(num_heads * (nope_size + nope_size), kv_lora_rank, bias=False, dtype=weight_type)
|
||||
o_proj = nn.Linear(num_heads * nope_size, hidden_size, bias=False, dtype=weight_type)
|
||||
q_a_norm = torch.ones(hidden_size, dtype=torch.float32)
|
||||
kv_a_norm = torch.ones(hidden_size, dtype=torch.float32)
|
||||
@@ -190,7 +195,7 @@ if use_real_weights := True:
|
||||
|
||||
o_proj_weight, type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.attn_output.weight")
|
||||
o_proj.weight = nn.Parameter(o_proj_weight.view(torch.bfloat16), requires_grad=False)
|
||||
|
||||
|
||||
else:
|
||||
init.normal_(q_a_proj.weight, mean=0.0, std=0.02)
|
||||
init.normal_(q_b_proj.weight, mean=0.0, std=0.02)
|
||||
@@ -203,16 +208,16 @@ q_absorb = x_reshaped[:, 0]
|
||||
out_absorb = x_reshaped[:, 1]
|
||||
|
||||
|
||||
hidden_states = torch.randn((qlen, hidden_size), dtype=input_type).to('cpu').contiguous()
|
||||
hidden_states = torch.randn((qlen, hidden_size), dtype=input_type).to("cpu").contiguous()
|
||||
|
||||
|
||||
def build_mla():
|
||||
os.environ["BLAS_NUM_THREADS"] = "1"
|
||||
q_a_proj_weight = q_a_proj.weight.to(weight_type).to('cpu').contiguous()
|
||||
q_b_proj_weight = q_b_proj.weight.to(weight_type).to('cpu').contiguous()
|
||||
kv_a_proj_with_mqa_weight = kv_a_proj_with_mqa.weight.to('cpu').to(weight_type).contiguous()
|
||||
kv_b_proj_weight = kv_b_proj.weight.to(weight_type).to('cpu').contiguous()
|
||||
o_proj_weight = o_proj.weight.to(weight_type).to('cpu').contiguous()
|
||||
q_a_proj_weight = q_a_proj.weight.to(weight_type).to("cpu").contiguous()
|
||||
q_b_proj_weight = q_b_proj.weight.to(weight_type).to("cpu").contiguous()
|
||||
kv_a_proj_with_mqa_weight = kv_a_proj_with_mqa.weight.to("cpu").to(weight_type).contiguous()
|
||||
kv_b_proj_weight = kv_b_proj.weight.to(weight_type).to("cpu").contiguous()
|
||||
o_proj_weight = o_proj.weight.to(weight_type).to("cpu").contiguous()
|
||||
|
||||
config = kt_kernel_ext.mla.MLAConfig(
|
||||
hidden_size,
|
||||
@@ -224,7 +229,7 @@ def build_mla():
|
||||
)
|
||||
config.max_qlen = max_qlen
|
||||
config.max_kvlen = max_kvlen
|
||||
config.max_position_embeddings = max_position_embeddings
|
||||
config.max_position_embeddings = max_position_embeddings
|
||||
config.rope_scaling_factor = rope_scaling["factor"]
|
||||
config.rope_theta = rope_theta
|
||||
config.rope_scaling_beta_fast = rope_scaling["beta_fast"]
|
||||
@@ -244,7 +249,6 @@ def build_mla():
|
||||
config.kv_a_norm = kv_a_norm.data_ptr()
|
||||
config.kv_a_norm_type = ggml_type.FP32
|
||||
|
||||
|
||||
if weight_type == torch.float32:
|
||||
config.q_a_proj_type = ggml_type.FP32
|
||||
config.q_b_proj_type = ggml_type.FP32
|
||||
@@ -266,10 +270,8 @@ def build_mla():
|
||||
else:
|
||||
raise ValueError(f"Unsupported data type: {weight_type}")
|
||||
|
||||
|
||||
config.pool = CPUInfer.backend_
|
||||
|
||||
|
||||
if weight_type == torch.float32:
|
||||
mla = kt_kernel_ext.mla.MLA_F32(config)
|
||||
elif weight_type == torch.float16:
|
||||
@@ -278,25 +280,20 @@ def build_mla():
|
||||
mla = kt_kernel_ext.mla.MLA_F32(config)
|
||||
else:
|
||||
raise ValueError(f"Unsupported data type: {weight_type}")
|
||||
|
||||
|
||||
mla.load_weights()
|
||||
mla.set_local_pages(pages_count)
|
||||
return mla
|
||||
|
||||
|
||||
|
||||
|
||||
def load_fp32_tensor(file_path, shape):
|
||||
with open(file_path, 'rb') as f:
|
||||
with open(file_path, "rb") as f:
|
||||
raw_data = f.read()
|
||||
tensor = torch.frombuffer(raw_data, dtype=torch.float32)
|
||||
tensor = tensor.view(shape) # 根据你的 shape reshape
|
||||
return tensor
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
# page3 = load_fp32_tensor('/home/yzw/xwy/Projects/ktransformers-dev/csrc/ktransformers_ext/examples/debug1/query_0_tp_0_page_3_kv_lora_rank_norm.f32',(page_size,kv_lora_rank))
|
||||
# page3_2 = load_fp32_tensor('/home/yzw/xwy/Projects/ktransformers-dev/csrc/ktransformers_ext/examples/debug2/query_0_tp_0_page_3_kv_lora_rank_norm.f32',(page_size,kv_lora_rank))
|
||||
|
||||
@@ -320,7 +317,6 @@ def load_fp32_tensor(file_path, shape):
|
||||
# print(f'PE Attention Weights Diff: ave:{diff.mean()}, max:{diff.max()}')
|
||||
|
||||
|
||||
|
||||
# raw_attn_w_1 = load_fp32_tensor('/home/yzw/xwy/Projects/ktransformers-dev/csrc/ktransformers_ext/examples/debug1/query_0_tp_0_raw_attention_weights.f32',(1,max_kvlen))
|
||||
# raw_attn_w_2 = load_fp32_tensor('/home/yzw/xwy/Projects/ktransformers-dev/csrc/ktransformers_ext/examples/debug2/query_0_tp_0_raw_attention_weights.f32',(qlen,max_kvlen))
|
||||
# diff = torch.abs(raw_attn_w_1 - raw_attn_w_2[-1])
|
||||
@@ -334,22 +330,16 @@ def load_fp32_tensor(file_path, shape):
|
||||
# print(f'Output Diff: ave:{diff.mean()}, max:{diff.max()}')
|
||||
|
||||
|
||||
|
||||
|
||||
mla = build_mla()
|
||||
output = torch.zeros((qlen, hidden_size), dtype=input_type).to('cpu').contiguous()
|
||||
mla.forward([qlen],[page_table],[kvlen],hidden_states.data_ptr(),output.data_ptr())
|
||||
print("CPU MLA Output: ",output[-1])
|
||||
output = torch.zeros((qlen, hidden_size), dtype=input_type).to("cpu").contiguous()
|
||||
mla.forward([qlen], [page_table], [kvlen], hidden_states.data_ptr(), output.data_ptr())
|
||||
print("CPU MLA Output: ", output[-1])
|
||||
|
||||
|
||||
output_2 = torch.zeros((1, hidden_size), dtype=input_type).to('cpu').contiguous()
|
||||
mla.forward([1],[page_table],[qlen-1],hidden_states[-1].data_ptr(),output_2.data_ptr())
|
||||
print("CPU MLA Output 2: ",output_2[-1])
|
||||
output_2 = torch.zeros((1, hidden_size), dtype=input_type).to("cpu").contiguous()
|
||||
mla.forward([1], [page_table], [qlen - 1], hidden_states[-1].data_ptr(), output_2.data_ptr())
|
||||
print("CPU MLA Output 2: ", output_2[-1])
|
||||
|
||||
diff = torch.abs(output[-1] - output_2[-1])
|
||||
print(f'Diff: ave:{diff.mean()}, max:{diff.max()}')
|
||||
print(f"Diff: ave:{diff.mean()}, max:{diff.max()}")
|
||||
assert diff.max() < 1e-1, "CPU and Torch outputs are not close enough!"
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -1,59 +1,62 @@
|
||||
import logging
|
||||
import os,sys
|
||||
import os, sys
|
||||
import time
|
||||
from typing import Optional
|
||||
|
||||
os.environ["BLAS_NUM_THREADS"] = "1"
|
||||
sys.path.insert(0, os.path.dirname(__file__) + '/../build')
|
||||
import kt_kernel_ext
|
||||
sys.path.insert(0, os.path.dirname(__file__) + "/../build")
|
||||
from kt_kernel import kt_kernel_ext
|
||||
from kt_kernel_ext.kvcache import ggml_type
|
||||
import torch
|
||||
from torch import inf, nn
|
||||
from torch.nn import init
|
||||
from torch_attention import apply_rotary_pos_emb,DeepseekV2RMSNorm,KDeepSeekV3Cache,DeepseekV3YarnRotaryEmbedding
|
||||
from torch_attention import apply_rotary_pos_emb, DeepseekV2RMSNorm, KDeepSeekV3Cache, DeepseekV3YarnRotaryEmbedding
|
||||
|
||||
logger = logging.getLogger("reader")
|
||||
|
||||
from gguf.gguf_reader import GGUFReader
|
||||
|
||||
|
||||
|
||||
def load_fp32_tensor_raw(file_path):
|
||||
# return torch.zeros(shape)
|
||||
with open(file_path, 'rb') as f:
|
||||
with open(file_path, "rb") as f:
|
||||
raw_data = f.read()
|
||||
tensor = torch.frombuffer(raw_data, dtype=torch.float32)
|
||||
return tensor
|
||||
|
||||
|
||||
def load_fp16_tensor(file_path, shape=None):
|
||||
# return load_fp32_tensor(file_path, shape)
|
||||
return load_fp32_tensor_raw(file_path)
|
||||
# return torch.zeros(shape)
|
||||
with open(file_path, 'rb') as f:
|
||||
with open(file_path, "rb") as f:
|
||||
raw_data = f.read()
|
||||
tensor = torch.frombuffer(raw_data, dtype=weight_type)
|
||||
tensor = tensor.view(shape) # 根据你的 shape reshape
|
||||
return tensor
|
||||
|
||||
|
||||
def load_fp32_tensor(file_path, shape):
|
||||
# return torch.zeros(shape)
|
||||
with open(file_path, 'rb') as f:
|
||||
with open(file_path, "rb") as f:
|
||||
raw_data = f.read()
|
||||
tensor = torch.frombuffer(raw_data, dtype=torch.float32)
|
||||
tensor = tensor.view(shape) # 根据你的 shape reshape
|
||||
return tensor
|
||||
|
||||
|
||||
def test_torch():
|
||||
torch.set_grad_enabled(False)
|
||||
|
||||
|
||||
hidden_states_to_check_decode = load_fp16_tensor('./debug_decode/query_0_tp_0_input.bin')
|
||||
hidden_states_to_check_prefill = load_fp16_tensor('./debug_prefill/query_0_tp_0_input.bin')
|
||||
hidden_states_to_check_decode = load_fp16_tensor("./debug_decode/query_0_tp_0_input.bin")
|
||||
hidden_states_to_check_prefill = load_fp16_tensor("./debug_prefill/query_0_tp_0_input.bin")
|
||||
# diff = torch.abs(hidden_states_to_check_prefill - hidden_states_to_check_decode).max()
|
||||
# print("hidden_states diff -> ", diff)
|
||||
|
||||
q_lora_to_check_decode = load_fp16_tensor('./debug_decode/query_0_tp_0_qlora.bin')
|
||||
q_lora_to_check_test_decode = load_fp16_tensor('./debug_decode/query_0_tp_0_qlora_test.bin')
|
||||
q_lora_to_check_prefill = load_fp16_tensor('./debug_prefill/query_0_tp_0_qlora.bin')
|
||||
q_lora_to_check_test_prefill = load_fp16_tensor('./debug_prefill/query_0_tp_0_qlora_test.bin')
|
||||
q_lora_to_check_decode = load_fp16_tensor("./debug_decode/query_0_tp_0_qlora.bin")
|
||||
q_lora_to_check_test_decode = load_fp16_tensor("./debug_decode/query_0_tp_0_qlora_test.bin")
|
||||
q_lora_to_check_prefill = load_fp16_tensor("./debug_prefill/query_0_tp_0_qlora.bin")
|
||||
q_lora_to_check_test_prefill = load_fp16_tensor("./debug_prefill/query_0_tp_0_qlora_test.bin")
|
||||
# diff = torch.abs(q_lora_to_check_prefill - q_lora_to_check_decode).max()
|
||||
# diff_test = torch.abs(q_lora_to_check_prefill - q_lora_to_check_decode).max()
|
||||
# print("q_lora max diff -> ", diff)
|
||||
@@ -63,8 +66,6 @@ def test_torch():
|
||||
# print("q_lora mae -> ", mae)
|
||||
# print("q_lora mae test -> ", mae_test)
|
||||
|
||||
|
||||
|
||||
# q_lora_norm = q_a_layernorm(q_lora)
|
||||
# q_lora_norm_to_check = load_fp16_tensor('./debug/query_0_tp_0_qlora_norm.bin', q_lora_norm.shape)
|
||||
# q_lora_norm_to_check_test = load_fp16_tensor('./debug/query_0_tp_0_qlora_norm_test.bin', q_lora_norm.shape)
|
||||
@@ -76,7 +77,7 @@ def test_torch():
|
||||
# print("q_lora_norm mae -> ", mae)
|
||||
# print("q_lora_norm diff test -> ", diff_test)
|
||||
# print("q_lora_norm mae test -> ", mae_test)
|
||||
|
||||
|
||||
# q = q_b_proj(q_lora_norm)
|
||||
# for v3, bsz, qlen, num_heads(128), qk_head_dim(192=128(nope)+64(rope))
|
||||
# q = q.view(qlen, num_heads, nope_size+rope_size)
|
||||
@@ -85,7 +86,7 @@ def test_torch():
|
||||
# q_nope, q_pe = torch.split(
|
||||
# q, [nope_size, rope_size], dim=-1
|
||||
# )
|
||||
|
||||
|
||||
# compressed_kv is [qlen, kv_lora_rank(512) + rope(64)]
|
||||
# compressed_kv = kv_a_proj_with_mqa(batch_hidden_states)
|
||||
# compressed_kv is [qlen, kv_lora_rank(512)], k_pe is [qlen, rope(64)]
|
||||
@@ -94,12 +95,11 @@ def test_torch():
|
||||
# )
|
||||
# compressed_kv = compressed_kv.contiguous()
|
||||
|
||||
|
||||
# compressed_kv_page_0 = compressed_kv[0:page_size, :]
|
||||
compressed_kv_to_check_decode = load_fp16_tensor('./debug_decode/query_0_tp_0_page_0_kv_lora_rank')
|
||||
compressed_kv_to_check_prefill = load_fp16_tensor('./debug_prefill/query_0_tp_0_page_0_kv_lora_rank')
|
||||
compressed_kv_to_check_decode = load_fp16_tensor("./debug_decode/query_0_tp_0_page_0_kv_lora_rank")
|
||||
compressed_kv_to_check_prefill = load_fp16_tensor("./debug_prefill/query_0_tp_0_page_0_kv_lora_rank")
|
||||
# diff = torch.abs(compressed_kv_to_check_prefill - compressed_kv_to_check_decode).max()
|
||||
# mae = torch.mean(torch.abs(compressed_kv_to_check_prefill - compressed_kv_to_check_decode))
|
||||
# mae = torch.mean(torch.abs(compressed_kv_to_check_prefill - compressed_kv_to_check_decode))
|
||||
# print("compressed_kv diff -> ", diff)
|
||||
# print("compressed_kv mae -> ", mae)
|
||||
|
||||
@@ -107,20 +107,17 @@ def test_torch():
|
||||
# k_pe is [qlen, 1, qk_rope_head_dim(64)]
|
||||
|
||||
# compressed_kv_page_0 = compressed_kv[0:page_size, :]
|
||||
compressed_kv_to_check_decode = load_fp16_tensor('./debug_decode/query_0_tp_0_page_0_kv_lora_rank_norm')
|
||||
compressed_kv_to_check_prefill = load_fp16_tensor('./debug_prefill/query_0_tp_0_page_0_kv_lora_rank_norm')
|
||||
compressed_kv_to_check_decode = load_fp16_tensor("./debug_decode/query_0_tp_0_page_0_kv_lora_rank_norm")
|
||||
compressed_kv_to_check_prefill = load_fp16_tensor("./debug_prefill/query_0_tp_0_page_0_kv_lora_rank_norm")
|
||||
# diff = torch.abs(compressed_kv_page_0 - compressed_kv_to_check).max()
|
||||
# mae = torch.mean(torch.abs(compressed_kv_page_0 - compressed_kv_to_check))
|
||||
# print("compressed_kv diff norm -> ", diff)
|
||||
# print("compressed_kv mae norm -> ", mae)
|
||||
|
||||
|
||||
|
||||
|
||||
# k_pe = k_pe.view(qlen, 1, rope_size)
|
||||
# compressed_kv is [qlen, 1, kv_lora_rank(512)]
|
||||
# compressed_kv = compressed_kv.view(qlen, 1, kv_lora_rank)
|
||||
|
||||
|
||||
# cos, sin = rotary_emb(q_pe, batch_position_ids)
|
||||
|
||||
# q_nope_check = q_nope.transpose(0, 1) # qlen is 1, no GPU overhead, same below
|
||||
@@ -135,11 +132,11 @@ def test_torch():
|
||||
# print("q_nope[0] mae -> ", mae)
|
||||
# print("q_nope[0] diff test -> ", diff_test)
|
||||
# print("q_nope[0] mae test -> ", mae_test)
|
||||
|
||||
|
||||
# q_pe_nope = q_pe.transpose(0,1)
|
||||
q_pe_0_to_check_decode = load_fp16_tensor('./debug_decode/query_0_tp_0_q_rope')
|
||||
q_pe_0_to_check_prefill = load_fp16_tensor('./debug_prefill/query_0_tp_0_q_rope')
|
||||
|
||||
q_pe_0_to_check_decode = load_fp16_tensor("./debug_decode/query_0_tp_0_q_rope")
|
||||
q_pe_0_to_check_prefill = load_fp16_tensor("./debug_prefill/query_0_tp_0_q_rope")
|
||||
|
||||
# q_pe_0_to_check_decode_test = load_fp16_tensor('./debug_decode/query_0_tp_0_q_rope_test')
|
||||
# q_pe_0_to_check_prefill_test = load_fp16_tensor('./debug_prefill/query_0_tp_0_q_rope_test')
|
||||
|
||||
@@ -180,12 +177,11 @@ def test_torch():
|
||||
# q_pe, k_pe = apply_rotary_pos_emb(q_pe.unsqueeze(0), k_pe.unsqueeze(0), cos, sin, unsqueeze_dim=1)
|
||||
# q_pe = q_pe.squeeze(0)
|
||||
# q_pe is [num_heads(128), qlen, qk_rope_head_dim(64)]
|
||||
# q_pe.transpose_(0, 1)
|
||||
# q_pe.transpose_(0, 1)
|
||||
|
||||
# diff = torch.abs(q_pe - q_new).max()
|
||||
# print("q_pe diff -> ", diff)
|
||||
|
||||
|
||||
# q_pe_0_to_check = load_fp16_tensor('./debug/query_0_tp_0_q_rope', q_pe[0].shape)
|
||||
# diff = torch.abs(q_pe[0] - q_pe_0_to_check).max()
|
||||
# mae = torch.mean(torch.abs(q_pe[0] - q_pe_0_to_check))
|
||||
@@ -240,7 +236,7 @@ def test_torch():
|
||||
# if batch_compressed_kv is None or batch_k_pe is None:
|
||||
# batch_compressed_kv = tmp_compressed_kv
|
||||
# batch_k_pe = tmp_k_pe
|
||||
# else:
|
||||
# else:
|
||||
# batch_compressed_kv = torch.cat((batch_compressed_kv, tmp_compressed_kv), dim=0)
|
||||
# batch_k_pe = torch.cat((batch_k_pe, tmp_k_pe), dim=0)
|
||||
# kv_total_len -= page_size
|
||||
@@ -250,16 +246,15 @@ def test_torch():
|
||||
# if batch_compressed_kv is None or batch_k_pe is None:
|
||||
# batch_compressed_kv = tmp_compressed_kv
|
||||
# batch_k_pe = tmp_k_pe
|
||||
# else:
|
||||
# else:
|
||||
# batch_compressed_kv = torch.cat((batch_compressed_kv, tmp_compressed_kv), dim=0)
|
||||
# batch_k_pe = torch.cat((batch_k_pe, tmp_k_pe), dim=0)
|
||||
# break
|
||||
# batch_compressed_kv is [kv_total_len(k_len), kv_lora_rank(512)]
|
||||
# batch_k_pe is [kv_total_len(k_len), qk_rope_head_dim(64)]
|
||||
|
||||
|
||||
k_pe_to_check_decode = load_fp16_tensor('./debug_decode/query_0_tp_0_page_0_k_rope', (256,64))
|
||||
k_pe_to_check_prefill = load_fp16_tensor('./debug_prefill/query_0_tp_0_page_0_k_rope', (256,64))
|
||||
k_pe_to_check_decode = load_fp16_tensor("./debug_decode/query_0_tp_0_page_0_k_rope", (256, 64))
|
||||
k_pe_to_check_prefill = load_fp16_tensor("./debug_prefill/query_0_tp_0_page_0_k_rope", (256, 64))
|
||||
# diff = torch.abs(k_pe_to_check_prefill - k_pe_to_check_decode).max()
|
||||
# mae = torch.mean(k_pe_to_check_prefill - k_pe_to_check_decode)
|
||||
# print("k_pe diff -> ", diff)
|
||||
@@ -267,13 +262,13 @@ def test_torch():
|
||||
|
||||
# pe_weights = torch.matmul(q_pe,batch_k_pe.mT)
|
||||
# kv_total_len = kv_page_nums * page_size
|
||||
pe_weights_0_decode = load_fp16_tensor('./debug_decode/query_0_tp_0_pe_attention_weights', (1024,4096))
|
||||
pe_weights_0_prefill = load_fp16_tensor('./debug_prefill/query_0_tp_0_pe_attention_weights', (1024,4096))
|
||||
pe_weights_0_decode = load_fp16_tensor("./debug_decode/query_0_tp_0_pe_attention_weights", (1024, 4096))
|
||||
pe_weights_0_prefill = load_fp16_tensor("./debug_prefill/query_0_tp_0_pe_attention_weights", (1024, 4096))
|
||||
|
||||
# diff = torch.abs(pe_weights[0] - pe_weights_0).max()
|
||||
# print("pe_weights[0] diff -> ", diff)
|
||||
|
||||
# attention_weights = (pe_weights + torch.matmul(q_nope, batch_compressed_kv.mT))
|
||||
# attention_weights = (pe_weights + torch.matmul(q_nope, batch_compressed_kv.mT))
|
||||
|
||||
# raw_weights = load_fp16_tensor('./debug/query_0_tp_0_raw_attention_weights', (1024, 4096))
|
||||
# raw_weights = raw_weights[0:qlen, 0:kv_total_len]
|
||||
@@ -282,25 +277,23 @@ def test_torch():
|
||||
|
||||
# attention_weights = attention_weights * softmax_scale
|
||||
# attention_weights is [num_heads(128), qlen, k_len]
|
||||
|
||||
|
||||
# attention_weights = attention_weights.transpose(0,1).unsqueeze(0).squeeze(-1).expand(qlen,-1,-1).transpose(0,1)
|
||||
|
||||
|
||||
# attention_masks[i] is [qlen, k_len]
|
||||
|
||||
|
||||
# attention_weights = (attention_weights + attention_masks)
|
||||
# attention_weights shape is [num_heads(128), qlen, k_len]
|
||||
|
||||
|
||||
# attention_weights = nn.functional.softmax(attention_weights,dim=-1,dtype=weight_type).to(q_pe.dtype)
|
||||
|
||||
attention_weights_0_decode = load_fp16_tensor('./debug_decode/query_0_tp_0_attention_weights', (1024, 4096))
|
||||
attention_weights_0_prefill = load_fp16_tensor('./debug_prefill/query_0_tp_0_attention_weights', (1024, 4096))
|
||||
attention_weights_0_decode = load_fp16_tensor("./debug_decode/query_0_tp_0_attention_weights", (1024, 4096))
|
||||
attention_weights_0_prefill = load_fp16_tensor("./debug_prefill/query_0_tp_0_attention_weights", (1024, 4096))
|
||||
|
||||
# attention_weights_0 = attention_weights_0[0:qlen, 0:kv_total_len]
|
||||
# diff = torch.abs(attention_weights[0] - attention_weights_0).max()
|
||||
# print("attention_weights[0] diff -> ", diff)
|
||||
|
||||
|
||||
# attn_output = torch.matmul(attention_weights, batch_compressed_kv) # [num_heads(128),qlen, lora_rank(512)]
|
||||
# out_absorb shape is [num_heads(128), kv_lora_rank(512), v_head_dim(128)]
|
||||
|
||||
@@ -322,7 +315,7 @@ def test_torch():
|
||||
# w_o = o_proj.weight.view([hidden_size,num_heads * nope_size])
|
||||
# output = torch.matmul(attn_output,w_o.transpose(0,1))
|
||||
# output = output.view(qlen, hidden_size)
|
||||
|
||||
|
||||
# output_0_check = load_fp16_tensor('./debug/query_0_tp_0_qlen_output', (qlen, hidden_size))
|
||||
# h1_o = w_o[:,:128]
|
||||
# local_o_check = load_fp16_tensor('./debug/query_0_tp_0_local_w_o', (hidden_size, 128))
|
||||
@@ -333,18 +326,15 @@ def test_torch():
|
||||
# diff = torch.abs(h1_output - output_0_check).max()
|
||||
# print("h1_output diff -> ", diff)
|
||||
|
||||
|
||||
output_check_decode = load_fp16_tensor('./debug_decode/output.bin')
|
||||
output_check_prefill = load_fp16_tensor('./debug_prefill/output.bin')
|
||||
output_check_decode = load_fp16_tensor("./debug_decode/output.bin")
|
||||
output_check_prefill = load_fp16_tensor("./debug_prefill/output.bin")
|
||||
# diff = torch.abs(output - output_check).max()
|
||||
# mae = torch.mean(torch.abs(output - output_check))
|
||||
# print("output diff -> ", diff)
|
||||
|
||||
|
||||
|
||||
|
||||
return None
|
||||
|
||||
|
||||
torch.set_printoptions(sci_mode=False, precision=5)
|
||||
# output_cpu = test_cpu_mla()
|
||||
# output_cpu_quant = test_cpu_mla_quant()
|
||||
@@ -361,7 +351,3 @@ output_torch = test_torch()
|
||||
|
||||
# print(f'Diff: ave:{diff.mean()}, max:{diff.max()}, min:{diff.min()}, relative_mean:{diff_relative_mean}, relative_max:{diff_relative.max()}, relative_min:{diff_relative.min()}')
|
||||
# assert diff_relative_mean < 2e-1, "CPU and Torch outputs are not close enough!"
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -1,13 +1,14 @@
|
||||
import os,sys
|
||||
import os, sys
|
||||
import time
|
||||
from typing import Optional
|
||||
sys.path.insert(0, os.path.dirname(__file__) + '/../build')
|
||||
import kt_kernel_ext
|
||||
|
||||
sys.path.insert(0, os.path.dirname(__file__) + "/../build")
|
||||
from kt_kernel import kt_kernel_ext
|
||||
from kt_kernel_ext.kvcache import ggml_type
|
||||
import torch
|
||||
from torch import nn
|
||||
from torch.nn import init
|
||||
from torch_attention import apply_rotary_pos_emb,DeepseekV2RMSNorm,KDeepSeekV3Cache,DeepseekV3YarnRotaryEmbedding
|
||||
from torch_attention import apply_rotary_pos_emb, DeepseekV2RMSNorm, KDeepSeekV3Cache, DeepseekV3YarnRotaryEmbedding
|
||||
|
||||
|
||||
seed = 42 # 你可以选择任何整数作为种子
|
||||
@@ -19,7 +20,7 @@ kvlen = 0
|
||||
|
||||
|
||||
page_table = range(20)
|
||||
bsz_tensors=torch.tensor([1])
|
||||
bsz_tensors = torch.tensor([1])
|
||||
|
||||
|
||||
page_size = 256
|
||||
@@ -38,8 +39,7 @@ rope_theta = 10000
|
||||
max_qlen = 1024
|
||||
max_kvlen = 4096
|
||||
|
||||
max_position_embeddings = 163840
|
||||
|
||||
max_position_embeddings = 163840
|
||||
|
||||
|
||||
rope_scaling = {
|
||||
@@ -49,17 +49,16 @@ rope_scaling = {
|
||||
"mscale": 1.0,
|
||||
"mscale_all_dim": 1.0,
|
||||
"original_max_position_embeddings": 4096,
|
||||
"type": "yarn"
|
||||
"type": "yarn",
|
||||
}
|
||||
|
||||
|
||||
|
||||
CPUInfer = kt_kernel_ext.CPUInfer(64)
|
||||
validation_iter = 100
|
||||
|
||||
|
||||
q_a_proj = nn.Linear(hidden_size, q_lora_rank, bias=False, dtype=torch.float16)
|
||||
q_b_proj = nn.Linear(q_lora_rank, num_heads * (nope_size+rope_size) , bias=False, dtype=torch.float16)
|
||||
q_b_proj = nn.Linear(q_lora_rank, num_heads * (nope_size + rope_size), bias=False, dtype=torch.float16)
|
||||
kv_a_proj_with_mqa = nn.Linear(hidden_size, kv_lora_rank + rope_size, bias=False, dtype=torch.float16)
|
||||
kv_b_proj = nn.Linear(kv_lora_rank, num_heads * (nope_size + nope_size), bias=False, dtype=torch.float16)
|
||||
o_proj = nn.Linear(num_heads * nope_size, hidden_size, bias=False, dtype=torch.float16)
|
||||
@@ -70,13 +69,11 @@ init.normal_(kv_a_proj_with_mqa.weight, mean=0.0, std=0.02)
|
||||
init.normal_(kv_b_proj.weight, mean=0.0, std=0.02)
|
||||
init.normal_(o_proj.weight, mean=0.0, std=0.02)
|
||||
|
||||
q_a_proj_weight = q_a_proj.weight.to(torch.float16).to('cpu').contiguous()
|
||||
q_b_proj_weight = q_b_proj.weight.to(torch.float16).to('cpu').contiguous()
|
||||
kv_a_proj_with_mqa_weight = kv_a_proj_with_mqa.weight.to('cpu').to(torch.float16).contiguous()
|
||||
kv_b_proj_weight = kv_b_proj.weight.to(torch.float16).to('cpu').contiguous()
|
||||
o_proj_weight = o_proj.weight.to(torch.float16).to('cpu').contiguous()
|
||||
|
||||
|
||||
q_a_proj_weight = q_a_proj.weight.to(torch.float16).to("cpu").contiguous()
|
||||
q_b_proj_weight = q_b_proj.weight.to(torch.float16).to("cpu").contiguous()
|
||||
kv_a_proj_with_mqa_weight = kv_a_proj_with_mqa.weight.to("cpu").to(torch.float16).contiguous()
|
||||
kv_b_proj_weight = kv_b_proj.weight.to(torch.float16).to("cpu").contiguous()
|
||||
o_proj_weight = o_proj.weight.to(torch.float16).to("cpu").contiguous()
|
||||
|
||||
|
||||
config = kt_kernel_ext.mla.MLAConfig(
|
||||
@@ -89,7 +86,7 @@ config = kt_kernel_ext.mla.MLAConfig(
|
||||
)
|
||||
config.max_qlen = max_qlen
|
||||
config.max_kvlen = max_kvlen
|
||||
config.max_position_embeddings = max_position_embeddings
|
||||
config.max_position_embeddings = max_position_embeddings
|
||||
config.rope_scaling_factor = rope_scaling["factor"]
|
||||
config.rope_theta = rope_theta
|
||||
config.rope_scaling_beta_fast = rope_scaling["beta_fast"]
|
||||
@@ -114,30 +111,27 @@ config.w_o_type = ggml_type.FP16
|
||||
config.pool = CPUInfer.backend_
|
||||
|
||||
|
||||
|
||||
mla = kt_kernel_ext.mla.MLA(config)
|
||||
mla.load_weights()
|
||||
mla.set_local_pages(pages_count)
|
||||
|
||||
|
||||
|
||||
input = torch.randn((qlen, hidden_size), dtype=torch.float16).to('cpu').contiguous()
|
||||
input = torch.randn((qlen, hidden_size), dtype=torch.float16).to("cpu").contiguous()
|
||||
|
||||
|
||||
output = torch.zeros((qlen, hidden_size), dtype=torch.float16).to('cpu').contiguous()
|
||||
mla.forward([qlen],[page_table],[kvlen],input.data_ptr(),output.data_ptr())
|
||||
print("CPU MLA Output: ",output)
|
||||
|
||||
output = torch.zeros((qlen, hidden_size), dtype=torch.float16).to("cpu").contiguous()
|
||||
mla.forward([qlen], [page_table], [kvlen], input.data_ptr(), output.data_ptr())
|
||||
print("CPU MLA Output: ", output)
|
||||
|
||||
|
||||
softmax_scale = (nope_size + rope_size) ** -0.5
|
||||
# 1代表的是压缩的kv的头数
|
||||
k_caches = torch.randn(1,pages_count, page_size, 1, kv_lora_rank + rope_size).to(torch.float16)
|
||||
k_caches = torch.randn(1, pages_count, page_size, 1, kv_lora_rank + rope_size).to(torch.float16)
|
||||
kv_cache = KDeepSeekV3Cache(page_size=page_size, kv_lora_rank=kv_lora_rank, k_caches=k_caches)
|
||||
|
||||
q_a_layernorm = DeepseekV2RMSNorm(q_lora_rank)
|
||||
|
||||
x = torch.randn(q_lora_rank, dtype=torch.float16)*100
|
||||
x = torch.randn(q_lora_rank, dtype=torch.float16) * 100
|
||||
print(x)
|
||||
print(q_a_layernorm(x))
|
||||
|
||||
@@ -163,110 +157,114 @@ rotary_emb = DeepseekV3YarnRotaryEmbedding(
|
||||
# last_page_len = [qlen+kvlen,...] layer_idx = 1
|
||||
# position_ids = [kvlen:qlen+kvlen]
|
||||
hidden_states = torch.randn(qlen, hidden_size, dtype=torch.float16)
|
||||
q_indptr = torch.tensor([0,qlen]).to(torch.int32)
|
||||
q_indptr = torch.tensor([0, qlen]).to(torch.int32)
|
||||
|
||||
kv_indptr = torch.tensor([0,(qlen+kvlen+page_size-1)//page_size]).to(torch.int32)
|
||||
kv_indptr = torch.tensor([0, (qlen + kvlen + page_size - 1) // page_size]).to(torch.int32)
|
||||
kv_indices = torch.tensor(range(pages_count)).to(torch.int32)
|
||||
|
||||
page_idx = torch.tensor([i//page_size for i in range(kvlen,kvlen+qlen)] ).to(torch.int32)
|
||||
page_offset = torch.tensor( [i%page_size for i in range(kvlen, kvlen + qlen)]).to(torch.int32)
|
||||
page_idx = torch.tensor([i // page_size for i in range(kvlen, kvlen + qlen)]).to(torch.int32)
|
||||
page_offset = torch.tensor([i % page_size for i in range(kvlen, kvlen + qlen)]).to(torch.int32)
|
||||
|
||||
last_page_len = torch.tensor([(qlen+kvlen)%page_size], device=hidden_states.device)
|
||||
last_page_len = torch.tensor([(qlen + kvlen) % page_size], device=hidden_states.device)
|
||||
position_ids = torch.tensor(range(kvlen, kvlen + qlen)).to(torch.int32)
|
||||
|
||||
|
||||
# 按照行创建 mask [qlen,kvlen+qlen]
|
||||
attention_masks = torch.zeros((qlen, kvlen + qlen), dtype=torch.float16)
|
||||
for i in range(qlen):
|
||||
attention_masks[i, i + kvlen + 1: i + kvlen + qlen] = -65504.0
|
||||
attention_masks[i, i + kvlen + 1 : i + kvlen + qlen] = -65504.0
|
||||
|
||||
|
||||
def torch_attn(hidden_states: torch.Tensor,
|
||||
kv_cache: KDeepSeekV3Cache,
|
||||
position_ids: torch.Tensor,
|
||||
page_idx: torch.Tensor,
|
||||
page_offset: torch.Tensor,
|
||||
attention_masks: Optional[list[torch.Tensor]] = None,
|
||||
q_indptr: Optional[torch.Tensor] = None,
|
||||
kv_indices: Optional[torch.Tensor] = None,
|
||||
kv_indptr: Optional[torch.Tensor] = None,
|
||||
bsz_tensors: Optional[torch.Tensor] = None,
|
||||
last_page_len: Optional[torch.Tensor] = None,
|
||||
layer_idx: Optional[int] = None,
|
||||
):
|
||||
def torch_attn(
|
||||
hidden_states: torch.Tensor,
|
||||
kv_cache: KDeepSeekV3Cache,
|
||||
position_ids: torch.Tensor,
|
||||
page_idx: torch.Tensor,
|
||||
page_offset: torch.Tensor,
|
||||
attention_masks: Optional[list[torch.Tensor]] = None,
|
||||
q_indptr: Optional[torch.Tensor] = None,
|
||||
kv_indices: Optional[torch.Tensor] = None,
|
||||
kv_indptr: Optional[torch.Tensor] = None,
|
||||
bsz_tensors: Optional[torch.Tensor] = None,
|
||||
last_page_len: Optional[torch.Tensor] = None,
|
||||
layer_idx: Optional[int] = None,
|
||||
):
|
||||
global out_absorb
|
||||
global q_absorb
|
||||
# range bsz_tensors
|
||||
final_attention_output = torch.tensor([], device=hidden_states.device)
|
||||
for i in range(bsz_tensors[0]):
|
||||
batch_num_tokens_tensors = q_indptr[i+1] - q_indptr[i]
|
||||
batch_num_tokens_tensors = q_indptr[i + 1] - q_indptr[i]
|
||||
batch_last_page_len = last_page_len[i]
|
||||
# kv_total_len is kv_len, batch_compressed_kv is compressed_kv, batch_k_pe is k_pe
|
||||
batch_page_idx = page_idx[q_indptr[i]:q_indptr[i+1]]
|
||||
batch_page_offset = page_offset[q_indptr[i]:q_indptr[i+1]]
|
||||
batch_page_idx = page_idx[q_indptr[i] : q_indptr[i + 1]]
|
||||
batch_page_offset = page_offset[q_indptr[i] : q_indptr[i + 1]]
|
||||
# kv_page_nums is the number of pages for the current batch
|
||||
kv_page_nums = kv_indptr[i+1] - kv_indptr[i]
|
||||
kv_page_nums = kv_indptr[i + 1] - kv_indptr[i]
|
||||
# kv_total_len is the total length of the kv cache for the current batch (kv_len for algorithm)
|
||||
kv_total_len = kv_page_nums * page_size
|
||||
if batch_last_page_len is not None:
|
||||
kv_total_len = kv_total_len - (page_size - batch_last_page_len)
|
||||
# print(f"kv_total_len's shape {kv_total_len.shape}")
|
||||
# kv_index is the index of the kv cache pages for the current batch
|
||||
kv_index = kv_indices[kv_indptr[i]:kv_indptr[i+1]]
|
||||
kv_index = kv_indices[kv_indptr[i] : kv_indptr[i + 1]]
|
||||
# we can index [kv_index, page_offset_indices] to get the kv cache for the current batch
|
||||
# from q_indptr[i] to q_indptr[i+1] is the range of the current batch
|
||||
batch_hidden_states = hidden_states[q_indptr[i]:q_indptr[i+1]]
|
||||
batch_position_ids = position_ids[q_indptr[i]:q_indptr[i+1]]
|
||||
batch_hidden_states = hidden_states[q_indptr[i] : q_indptr[i + 1]]
|
||||
batch_position_ids = position_ids[q_indptr[i] : q_indptr[i + 1]]
|
||||
qlen, _ = batch_hidden_states.size()
|
||||
# print("qlen -> ", qlen)
|
||||
q_lora = q_a_proj(batch_hidden_states)
|
||||
print('q_a_proj',q_a_proj.weight)
|
||||
print('q_lora',q_lora)
|
||||
|
||||
print("q_a_proj", q_a_proj.weight)
|
||||
print("q_lora", q_lora)
|
||||
|
||||
q = q_b_proj(q_a_layernorm(q_lora))
|
||||
print('q_b_proj',q_b_proj.weight)
|
||||
print("q_b_proj", q_b_proj.weight)
|
||||
# for v3, bsz, qlen, num_heads(128), qk_head_dim(192=128(nope)+64(rope))
|
||||
q = q.view(qlen, num_heads, nope_size+rope_size)
|
||||
q = q.view(qlen, num_heads, nope_size + rope_size)
|
||||
# q_nope is [qlen, num_heads(128), qk_nope_head_dim(128)]
|
||||
# q_pe is [qlen, num_heads(128), qk_rope_head_dim(64)]
|
||||
q_nope, q_pe = torch.split(
|
||||
q, [nope_size, rope_size], dim=-1
|
||||
)
|
||||
print('q_nope',q_nope)
|
||||
print('q_pe',q_pe)
|
||||
q_nope, q_pe = torch.split(q, [nope_size, rope_size], dim=-1)
|
||||
print("q_nope", q_nope)
|
||||
print("q_pe", q_pe)
|
||||
# compressed_kv is [qlen, kv_lora_rank(512) + rope(64)]
|
||||
compressed_kv = kv_a_proj_with_mqa(batch_hidden_states)
|
||||
# compressed_kv is [qlen, kv_lora_rank(512)], k_pe is [qlen, rope(64)]
|
||||
compressed_kv, k_pe = torch.split(
|
||||
compressed_kv, [kv_lora_rank, rope_size], dim=-1
|
||||
)
|
||||
compressed_kv, k_pe = torch.split(compressed_kv, [kv_lora_rank, rope_size], dim=-1)
|
||||
compressed_kv = compressed_kv.contiguous()
|
||||
compressed_kv = kv_a_layernorm(compressed_kv)
|
||||
# k_pe is [qlen, 1, qk_rope_head_dim(64)]
|
||||
print('compressed_kv ',compressed_kv)
|
||||
print('k_pe ',k_pe)
|
||||
print("compressed_kv ", compressed_kv)
|
||||
print("k_pe ", k_pe)
|
||||
k_pe = k_pe.view(qlen, 1, rope_size)
|
||||
# compressed_kv is [qlen, 1, kv_lora_rank(512)]
|
||||
compressed_kv = compressed_kv.view(qlen, 1, kv_lora_rank)
|
||||
|
||||
|
||||
cos, sin = rotary_emb(q_pe, batch_position_ids)
|
||||
# print(f"q_pe shape{q_pe.shape}, k_pe shape {k_pe.shape}")
|
||||
q_pe, k_pe = apply_rotary_pos_emb(q_pe.unsqueeze(0), k_pe.unsqueeze(0), cos, sin, unsqueeze_dim=1)
|
||||
q_pe = q_pe.squeeze(0)
|
||||
# q_pe is [num_heads(128), qlen, qk_rope_head_dim(64)]
|
||||
q_pe.transpose_(0, 1)
|
||||
q_pe.transpose_(0, 1)
|
||||
if kv_cache is not None:
|
||||
cache_kwargs = {"sin": sin, "cos": cos, "page_idx": batch_page_idx, "page_offset": batch_page_offset} # Specific to RoPE models
|
||||
compressed_kv_with_k_pe = kv_cache.update(compressed_kv.unsqueeze(0), k_pe, layer_idx, batch_page_idx, batch_page_offset, cache_kwargs)
|
||||
compressed_kv = compressed_kv_with_k_pe [:, :, :, :kv_lora_rank].view(-1, page_size, kv_lora_rank)
|
||||
k_pe = compressed_kv_with_k_pe [:, :, :, kv_lora_rank:].view(-1, page_size, rope_size)
|
||||
cache_kwargs = {
|
||||
"sin": sin,
|
||||
"cos": cos,
|
||||
"page_idx": batch_page_idx,
|
||||
"page_offset": batch_page_offset,
|
||||
} # Specific to RoPE models
|
||||
compressed_kv_with_k_pe = kv_cache.update(
|
||||
compressed_kv.unsqueeze(0), k_pe, layer_idx, batch_page_idx, batch_page_offset, cache_kwargs
|
||||
)
|
||||
compressed_kv = compressed_kv_with_k_pe[:, :, :, :kv_lora_rank].view(-1, page_size, kv_lora_rank)
|
||||
k_pe = compressed_kv_with_k_pe[:, :, :, kv_lora_rank:].view(-1, page_size, rope_size)
|
||||
# q_absorb is [num_heads(128), qk_nope_head_dim(128), kv_lora_rank(512)]
|
||||
# out_absorb is [num_heads(128), kv_lora_rank(512), v_head_dim(128)] v_head_dim is also the nope dim
|
||||
# q_absorb, out_absorb = get_absorbed()
|
||||
# q_nope is [num_heads(128), qlen, qk_nope_head_dim(128)]
|
||||
q_nope = q_nope.transpose(0, 1) # qlen is 1, no GPU overhead, same below
|
||||
q_nope = q_nope.transpose(0, 1) # qlen is 1, no GPU overhead, same below
|
||||
# q_nope is [num_heads(128), qlen, kv_lora_rank(512)]
|
||||
q_nope = torch.matmul(q_nope, q_absorb) # batched MM
|
||||
q_nope = torch.matmul(q_nope, q_absorb) # batched MM
|
||||
|
||||
# # q_nope is [qlen, num_heads(128), kv_lora_rank(512)]
|
||||
# q_nope = q_nope.transpose(0, 1)
|
||||
@@ -281,7 +279,7 @@ def torch_attn(hidden_states: torch.Tensor,
|
||||
if batch_compressed_kv is None or batch_k_pe is None:
|
||||
batch_compressed_kv = tmp_compressed_kv
|
||||
batch_k_pe = tmp_k_pe
|
||||
else:
|
||||
else:
|
||||
batch_compressed_kv = torch.cat((batch_compressed_kv, tmp_compressed_kv), dim=0)
|
||||
batch_k_pe = torch.cat((batch_k_pe, tmp_k_pe), dim=0)
|
||||
kv_total_len -= page_size
|
||||
@@ -291,57 +289,48 @@ def torch_attn(hidden_states: torch.Tensor,
|
||||
if batch_compressed_kv is None or batch_k_pe is None:
|
||||
batch_compressed_kv = tmp_compressed_kv
|
||||
batch_k_pe = tmp_k_pe
|
||||
else:
|
||||
else:
|
||||
batch_compressed_kv = torch.cat((batch_compressed_kv, tmp_compressed_kv), dim=0)
|
||||
batch_k_pe = torch.cat((batch_k_pe, tmp_k_pe), dim=0)
|
||||
break
|
||||
# batch_compressed_kv is [kv_total_len(k_len), kv_lora_rank(512)]
|
||||
# batch_k_pe is [kv_total_len(k_len), qk_rope_head_dim(64)]
|
||||
pe_weights = torch.matmul(q_pe,batch_k_pe.mT)
|
||||
print('pe_weights',pe_weights)
|
||||
pe_weights = torch.matmul(q_pe, batch_k_pe.mT)
|
||||
print("pe_weights", pe_weights)
|
||||
attention_weights = (pe_weights + torch.matmul(q_nope, batch_compressed_kv.mT)) * softmax_scale
|
||||
# attention_weights is [num_heads(128), qlen, k_len]
|
||||
|
||||
|
||||
# attention_weights = attention_weights.transpose(0,1).unsqueeze(0).squeeze(-1).expand(qlen,-1,-1).transpose(0,1)
|
||||
|
||||
|
||||
# attention_masks[i] is [qlen, k_len]
|
||||
|
||||
attention_weights = (attention_weights + attention_masks[i])
|
||||
|
||||
attention_weights = attention_weights + attention_masks[i]
|
||||
# attention_weights shape is [num_heads(128), qlen, k_len]
|
||||
attention_weights = nn.functional.softmax(attention_weights,dim=-1,dtype=torch.float16).to(q_pe.dtype)
|
||||
attn_output = torch.matmul(attention_weights, batch_compressed_kv) # [num_heads(128),qlen, lora_rank(512)]
|
||||
attention_weights = nn.functional.softmax(attention_weights, dim=-1, dtype=torch.float16).to(q_pe.dtype)
|
||||
attn_output = torch.matmul(attention_weights, batch_compressed_kv) # [num_heads(128),qlen, lora_rank(512)]
|
||||
# out_absorb shape is [num_heads(128), kv_lora_rank(512), v_head_dim(128)]
|
||||
out_absorb = out_absorb.transpose(1,2)
|
||||
out_absorb = out_absorb.transpose(1, 2)
|
||||
# q for qlen, n for num_heads, h for v_head_dim, v for kv_lora_rank
|
||||
attn_output = torch.matmul(attn_output, out_absorb) # [num_heads(128), qlen, v_head_dim(128)]
|
||||
attn_output = attn_output.transpose(0, 1) # [qlen, num_heads(128), v_head_dim(128)]
|
||||
attn_output = torch.matmul(attn_output, out_absorb) # [num_heads(128), qlen, v_head_dim(128)]
|
||||
attn_output = attn_output.transpose(0, 1) # [qlen, num_heads(128), v_head_dim(128)]
|
||||
attn_output = attn_output.reshape(qlen, num_heads * nope_size)
|
||||
attn_output = o_proj(attn_output)
|
||||
final_attention_output = torch.cat((final_attention_output, attn_output), dim=0)
|
||||
return final_attention_output
|
||||
|
||||
|
||||
|
||||
torch_output = torch_attn(
|
||||
input,
|
||||
kv_cache,
|
||||
position_ids,
|
||||
page_idx,
|
||||
page_offset,
|
||||
attention_masks=attention_masks,
|
||||
q_indptr=q_indptr,
|
||||
kv_indices=kv_indices,
|
||||
kv_indptr=kv_indptr,
|
||||
bsz_tensors=bsz_tensors,
|
||||
last_page_len=last_page_len,
|
||||
layer_idx=0
|
||||
)
|
||||
print("Torch Output: ",torch_output)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
input,
|
||||
kv_cache,
|
||||
position_ids,
|
||||
page_idx,
|
||||
page_offset,
|
||||
attention_masks=attention_masks,
|
||||
q_indptr=q_indptr,
|
||||
kv_indices=kv_indices,
|
||||
kv_indptr=kv_indptr,
|
||||
bsz_tensors=bsz_tensors,
|
||||
last_page_len=last_page_len,
|
||||
layer_idx=0,
|
||||
)
|
||||
print("Torch Output: ", torch_output)
|
||||
|
||||
@@ -1,36 +1,39 @@
|
||||
#!/usr/bin/env python
|
||||
# coding=utf-8
|
||||
'''
|
||||
Description :
|
||||
"""
|
||||
Description :
|
||||
Author : chenht2022
|
||||
Date : 2024-07-25 10:32:05
|
||||
Version : 1.0.0
|
||||
LastEditors : chenht2022
|
||||
LastEditors : chenht2022
|
||||
LastEditTime : 2024-08-06 10:37:28
|
||||
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
|
||||
'''
|
||||
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
|
||||
"""
|
||||
import os, sys
|
||||
import time
|
||||
sys.path.append(os.path.dirname(__file__) + '/../build')
|
||||
import kt_kernel_ext
|
||||
|
||||
sys.path.append(os.path.dirname(__file__) + "/../build")
|
||||
from kt_kernel import kt_kernel_ext
|
||||
import torch
|
||||
|
||||
hidden_size = 5120
|
||||
intermediate_size = 3072
|
||||
stride = 32
|
||||
group_max_len = 1024
|
||||
gate_type = 1 # ggml_type::GGML_TYPE_F16
|
||||
up_type = 1 # ggml_type::GGML_TYPE_F16
|
||||
down_type = 1 # ggml_type::GGML_TYPE_F16
|
||||
hidden_type = 1 # ggml_type::GGML_TYPE_F16
|
||||
gate_type = 1 # ggml_type::GGML_TYPE_F16
|
||||
up_type = 1 # ggml_type::GGML_TYPE_F16
|
||||
down_type = 1 # ggml_type::GGML_TYPE_F16
|
||||
hidden_type = 1 # ggml_type::GGML_TYPE_F16
|
||||
qlen = 30
|
||||
layer_num = 10
|
||||
CPUInfer = kt_kernel_ext.CPUInfer(48)
|
||||
validation_iter = 100
|
||||
|
||||
|
||||
def act_fn(x):
|
||||
return x / (1.0 + torch.exp(-x))
|
||||
|
||||
|
||||
def mlp_torch(input, gate_proj, up_proj, down_proj):
|
||||
gate_buf = torch.mm(input, gate_proj.t())
|
||||
up_buf = torch.mm(input, up_proj.t())
|
||||
@@ -38,16 +41,35 @@ def mlp_torch(input, gate_proj, up_proj, down_proj):
|
||||
ret = torch.mm(intermediate, down_proj.t())
|
||||
return ret
|
||||
|
||||
|
||||
with torch.inference_mode(mode=True):
|
||||
mlps = []
|
||||
gate_projs = []
|
||||
up_projs = []
|
||||
down_projs = []
|
||||
for _ in range(layer_num):
|
||||
gate_proj = torch.randn((intermediate_size, hidden_size), dtype=torch.float16, device = "cuda").to("cpu").contiguous()
|
||||
up_proj = torch.randn((intermediate_size, hidden_size), dtype=torch.float16, device = "cuda").to("cpu").contiguous()
|
||||
down_proj = torch.randn((hidden_size, intermediate_size), dtype=torch.float16, device = "cuda").to("cpu").contiguous()
|
||||
config = kt_kernel_ext.mlp.MLPConfig(hidden_size, intermediate_size, stride, group_max_len, gate_proj.data_ptr(), up_proj.data_ptr(), down_proj.data_ptr(), gate_type, up_type, down_type, hidden_type)
|
||||
gate_proj = (
|
||||
torch.randn((intermediate_size, hidden_size), dtype=torch.float16, device="cuda").to("cpu").contiguous()
|
||||
)
|
||||
up_proj = (
|
||||
torch.randn((intermediate_size, hidden_size), dtype=torch.float16, device="cuda").to("cpu").contiguous()
|
||||
)
|
||||
down_proj = (
|
||||
torch.randn((hidden_size, intermediate_size), dtype=torch.float16, device="cuda").to("cpu").contiguous()
|
||||
)
|
||||
config = kt_kernel_ext.mlp.MLPConfig(
|
||||
hidden_size,
|
||||
intermediate_size,
|
||||
stride,
|
||||
group_max_len,
|
||||
gate_proj.data_ptr(),
|
||||
up_proj.data_ptr(),
|
||||
down_proj.data_ptr(),
|
||||
gate_type,
|
||||
up_type,
|
||||
down_type,
|
||||
hidden_type,
|
||||
)
|
||||
mlp = kt_kernel_ext.mlp.MLP(config)
|
||||
gate_projs.append(gate_proj)
|
||||
up_projs.append(up_proj)
|
||||
@@ -61,22 +83,16 @@ with torch.inference_mode(mode=True):
|
||||
output = torch.empty((qlen, hidden_size), dtype=torch.float16).contiguous()
|
||||
input = input / 100
|
||||
|
||||
CPUInfer.submit(
|
||||
mlp.forward(
|
||||
qlen,
|
||||
input.data_ptr(),
|
||||
output.data_ptr()
|
||||
)
|
||||
)
|
||||
CPUInfer.submit(mlp.forward(qlen, input.data_ptr(), output.data_ptr()))
|
||||
CPUInfer.sync()
|
||||
# print('cpuinfer output', output)
|
||||
|
||||
gate_proj = gate_projs[i%layer_num]
|
||||
up_proj = up_projs[i%layer_num]
|
||||
down_proj = down_projs[i%layer_num]
|
||||
gate_proj = gate_projs[i % layer_num]
|
||||
up_proj = up_projs[i % layer_num]
|
||||
down_proj = down_projs[i % layer_num]
|
||||
t_output = mlp_torch(input, gate_proj, up_proj, down_proj)
|
||||
# print('torch output', t_output)
|
||||
|
||||
diff = torch.mean(torch.abs(output - t_output)) / torch.mean(torch.abs(t_output))
|
||||
print('diff = ', diff)
|
||||
assert(diff < 0.001)
|
||||
print("diff = ", diff)
|
||||
assert diff < 0.001
|
||||
|
||||
@@ -1,18 +1,19 @@
|
||||
#!/usr/bin/env python
|
||||
# coding=utf-8
|
||||
'''
|
||||
Description :
|
||||
"""
|
||||
Description :
|
||||
Author : chenht2022
|
||||
Date : 2024-07-25 10:32:05
|
||||
Version : 1.0.0
|
||||
LastEditors : SkqLiao
|
||||
LastEditors : SkqLiao
|
||||
LastEditTime : 2025-03-13 11:38:05
|
||||
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
|
||||
'''
|
||||
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
|
||||
"""
|
||||
import os, sys
|
||||
import time
|
||||
sys.path.insert(0, os.path.dirname(__file__) + '/../build')
|
||||
import kt_kernel_ext
|
||||
|
||||
sys.path.insert(0, os.path.dirname(__file__) + "/../build")
|
||||
from kt_kernel import kt_kernel_ext
|
||||
import torch
|
||||
from tqdm import tqdm
|
||||
from kt_kernel_ext.kvcache import ggml_type
|
||||
@@ -20,7 +21,7 @@ from kt_kernel_ext.kvcache import ggml_type
|
||||
torch.manual_seed(0)
|
||||
|
||||
expert_num = 8
|
||||
hidden_size = 2048 #7168
|
||||
hidden_size = 2048 # 7168
|
||||
intermediate_size = 2048
|
||||
stride = 32
|
||||
group_min_len = 10
|
||||
@@ -39,9 +40,11 @@ layer_num = 1
|
||||
CPUInfer = kt_kernel_ext.CPUInfer(64)
|
||||
validation_iter = 10
|
||||
|
||||
|
||||
def act_fn(x):
|
||||
return x / (1.0 + torch.exp(-x))
|
||||
|
||||
|
||||
def mlp_torch(input, gate_proj, up_proj, down_proj):
|
||||
gate_buf = torch.mm(input, gate_proj.t())
|
||||
up_buf = torch.mm(input, up_proj.t())
|
||||
@@ -49,6 +52,7 @@ def mlp_torch(input, gate_proj, up_proj, down_proj):
|
||||
ret = torch.mm(intermediate, down_proj.t())
|
||||
return ret
|
||||
|
||||
|
||||
def moe_torch(input, expert_ids, weights, gate_proj, up_proj, down_proj):
|
||||
cnts = expert_ids.new_zeros((expert_ids.shape[0], expert_num))
|
||||
cnts.scatter_(1, expert_ids, 1)
|
||||
@@ -85,10 +89,12 @@ def to_cpuinfer_tensor(tensor, type):
|
||||
size = torch.prod(torch.tensor(tensor.shape, dtype=torch.int32)).item()
|
||||
return kt_kernel_ext.utils.from_float(tensor.data_ptr(), size, type)
|
||||
|
||||
|
||||
def from_cpuinfer_tensor(tensor, size, type):
|
||||
return kt_kernel_ext.utils.to_float(tensor.data_ptr(), size, type)
|
||||
|
||||
qlens = [1,64] #[64, 512, 2048, 8192, 16384]
|
||||
|
||||
qlens = [1, 64] # [64, 512, 2048, 8192, 16384]
|
||||
# gate_types = [ggml_type.FP32, ggml_type.FP16, ggml_type.Q8_0, ggml_type.Q6_K, ggml_type.Q5_K, ggml_type.Q4_K, ggml_type.Q3_K]
|
||||
# up_types = [ggml_type.FP32, ggml_type.FP16, ggml_type.Q8_0, ggml_type.Q6_K, ggml_type.Q5_K, ggml_type.Q4_K, ggml_type.Q3_K]
|
||||
# down_types = [ggml_type.FP32, ggml_type.FP16, ggml_type.Q8_0, ggml_type.Q6_K, ggml_type.Q6_K, ggml_type.Q6_K, ggml_type.Q5_K]
|
||||
@@ -96,8 +102,8 @@ gate_types = [ggml_type.Q4_K]
|
||||
up_types = [ggml_type.Q4_K]
|
||||
down_types = [ggml_type.Q6_K]
|
||||
hidden_type = ggml_type.BF16
|
||||
print(f'Parameters: expert_num: {expert_num} hidden_size: {hidden_size} intermediate_size: {intermediate_size}')
|
||||
print(f'group_max_len: ', group_max_len)
|
||||
print(f"Parameters: expert_num: {expert_num} hidden_size: {hidden_size} intermediate_size: {intermediate_size}")
|
||||
print(f"group_max_len: ", group_max_len)
|
||||
|
||||
for qlen in qlens:
|
||||
for gate_type, up_type, down_type in zip(gate_types, up_types, down_types):
|
||||
@@ -106,18 +112,30 @@ for qlen in qlens:
|
||||
gate_projs = []
|
||||
up_projs = []
|
||||
down_projs = []
|
||||
print('Preparing data...')
|
||||
print("Preparing data...")
|
||||
converted_tensors = []
|
||||
for _ in range(layer_num):
|
||||
size = expert_num * intermediate_size * hidden_size
|
||||
gate_proj = torch.randn((expert_num, intermediate_size, hidden_size), dtype=torch.float32, device = "cuda").to("cpu").contiguous()
|
||||
up_proj = torch.randn((expert_num, intermediate_size, hidden_size), dtype=torch.float32, device = "cuda").to("cpu").contiguous()
|
||||
down_proj = torch.randn((expert_num, hidden_size, intermediate_size), dtype=torch.float32, device = "cuda").to("cpu").contiguous()
|
||||
|
||||
gate_proj = (
|
||||
torch.randn((expert_num, intermediate_size, hidden_size), dtype=torch.float32, device="cuda")
|
||||
.to("cpu")
|
||||
.contiguous()
|
||||
)
|
||||
up_proj = (
|
||||
torch.randn((expert_num, intermediate_size, hidden_size), dtype=torch.float32, device="cuda")
|
||||
.to("cpu")
|
||||
.contiguous()
|
||||
)
|
||||
down_proj = (
|
||||
torch.randn((expert_num, hidden_size, intermediate_size), dtype=torch.float32, device="cuda")
|
||||
.to("cpu")
|
||||
.contiguous()
|
||||
)
|
||||
|
||||
gate_tensor = to_cpuinfer_tensor(gate_proj, gate_type)
|
||||
up_tensor = to_cpuinfer_tensor(up_proj, up_type)
|
||||
down_tensor = to_cpuinfer_tensor(down_proj, down_type)
|
||||
|
||||
|
||||
config = kt_kernel_ext.moe.MOEConfig(expert_num, num_experts_per_tok, hidden_size, intermediate_size)
|
||||
config.pool = CPUInfer.backend_
|
||||
config.stride = stride
|
||||
@@ -131,59 +149,62 @@ for qlen in qlens:
|
||||
config.down_type = down_type
|
||||
config.hidden_type = hidden_type
|
||||
|
||||
|
||||
moe = kt_kernel_ext.moe.MOE(config)
|
||||
gate_projs.append(gate_proj)
|
||||
up_projs.append(up_proj)
|
||||
down_projs.append(down_proj)
|
||||
down_projs.append(down_proj)
|
||||
CPUInfer.submit(moe.load_weights_task())
|
||||
CPUInfer.sync()
|
||||
moes.append(moe)
|
||||
converted_tensors.append((gate_tensor, up_tensor, down_tensor))
|
||||
print('Finished initialization!')
|
||||
print("Finished initialization!")
|
||||
|
||||
CPUInfer.submit(moes[0].warm_up_task())
|
||||
CPUInfer.sync()
|
||||
print('Warm up finished!')
|
||||
print("Warm up finished!")
|
||||
|
||||
# validation
|
||||
progress_bar = tqdm(range(validation_iter), desc="Starting")
|
||||
total_diff = 0
|
||||
|
||||
|
||||
for i in tqdm(progress_bar):
|
||||
progress_bar.set_description('Round: {}/{}'.format(i + 1, validation_iter))
|
||||
expert_ids = torch.stack([torch.randperm(expert_num)[:num_experts_per_tok] for _ in range(qlen)]).contiguous()
|
||||
progress_bar.set_description("Round: {}/{}".format(i + 1, validation_iter))
|
||||
expert_ids = torch.stack(
|
||||
[torch.randperm(expert_num)[:num_experts_per_tok] for _ in range(qlen)]
|
||||
).contiguous()
|
||||
weights = torch.rand((qlen, num_experts_per_tok), dtype=torch.float32).contiguous()
|
||||
input_proj = torch.randn((qlen, hidden_size), dtype=torch.float32).contiguous() / 100
|
||||
output_proj = torch.empty((qlen, hidden_size), dtype=torch.float32).contiguous()
|
||||
|
||||
|
||||
input_tensor = to_cpuinfer_tensor(input_proj, hidden_type)
|
||||
output_tensor = to_cpuinfer_tensor(output_proj, hidden_type)
|
||||
|
||||
|
||||
qlen_tensor = torch.tensor([qlen], dtype=torch.int32)
|
||||
moe = moes[i % layer_num]
|
||||
CPUInfer.submit(
|
||||
moe.forward_task(
|
||||
moe.forward_task(
|
||||
qlen_tensor.data_ptr(),
|
||||
num_experts_per_tok,
|
||||
expert_ids.data_ptr(),
|
||||
weights.data_ptr(),
|
||||
input_tensor.data_ptr(),
|
||||
num_experts_per_tok,
|
||||
expert_ids.data_ptr(),
|
||||
weights.data_ptr(),
|
||||
input_tensor.data_ptr(),
|
||||
output_tensor.data_ptr(),
|
||||
)
|
||||
)
|
||||
CPUInfer.sync()
|
||||
cpu_output = from_cpuinfer_tensor(output_tensor, qlen * hidden_size, hidden_type)
|
||||
|
||||
gate_proj = gate_projs[i%layer_num]
|
||||
up_proj = up_projs[i%layer_num]
|
||||
down_proj = down_projs[i%layer_num]
|
||||
gate_proj = gate_projs[i % layer_num]
|
||||
up_proj = up_projs[i % layer_num]
|
||||
down_proj = down_projs[i % layer_num]
|
||||
t_output = moe_torch(input_proj, expert_ids, weights, gate_proj, up_proj, down_proj)
|
||||
print('cpuinfer output', cpu_output)
|
||||
print('torch output', t_output)
|
||||
diff = torch.mean(torch.abs(cpu_output.flatten() - t_output.flatten())) / torch.mean(torch.abs(t_output.flatten()))
|
||||
print("cpuinfer output", cpu_output)
|
||||
print("torch output", t_output)
|
||||
diff = torch.mean(torch.abs(cpu_output.flatten() - t_output.flatten())) / torch.mean(
|
||||
torch.abs(t_output.flatten())
|
||||
)
|
||||
assert diff < 0.5
|
||||
total_diff += diff
|
||||
|
||||
print(f'gate_type: {gate_type}, up_type: {up_type}, down_type: {down_type}')
|
||||
print(f'Average diff: {total_diff / validation_iter:.4f}')
|
||||
|
||||
print(f"gate_type: {gate_type}, up_type: {up_type}, down_type: {down_type}")
|
||||
print(f"Average diff: {total_diff / validation_iter:.4f}")
|
||||
|
||||
@@ -4,7 +4,7 @@ sys.path.insert(0, os.path.dirname(__file__) + "/../build")
|
||||
print("sys.path:", sys.path)
|
||||
|
||||
import torch
|
||||
import kt_kernel_ext
|
||||
from kt_kernel import kt_kernel_ext
|
||||
|
||||
expert_num = 256
|
||||
hidden_size = 7168
|
||||
|
||||
@@ -15,7 +15,7 @@ import time
|
||||
sys.path.insert(0, os.path.dirname(__file__) + "/../build")
|
||||
os.environ["BLAS_NUM_THREADS"] = "1"
|
||||
import torch
|
||||
import kt_kernel_ext
|
||||
from kt_kernel import kt_kernel_ext
|
||||
|
||||
|
||||
expert_num = 16
|
||||
|
||||
@@ -14,7 +14,7 @@ import time
|
||||
|
||||
sys.path.insert(0, os.path.dirname(__file__) + "/../build")
|
||||
os.environ["BLAS_NUM_THREADS"] = "1"
|
||||
import kt_kernel_ext
|
||||
from kt_kernel import kt_kernel_ext
|
||||
import torch
|
||||
|
||||
expert_num = 16
|
||||
|
||||
@@ -15,7 +15,7 @@ from abc import ABC, abstractmethod
|
||||
import os
|
||||
import ctypes
|
||||
|
||||
import kt_kernel_ext
|
||||
from kt_kernel import kt_kernel_ext
|
||||
|
||||
|
||||
class KExpertsCPUBuffer:
|
||||
|
||||
Reference in New Issue
Block a user