mirror of
https://github.com/kvcache-ai/ktransformers.git
synced 2026-04-20 06:18:59 +00:00
354 lines
17 KiB
Python
354 lines
17 KiB
Python
import logging
|
|
import os, sys
|
|
import time
|
|
from typing import Optional
|
|
|
|
os.environ["BLAS_NUM_THREADS"] = "1"
|
|
sys.path.insert(0, os.path.dirname(__file__) + "/../build")
|
|
from kt_kernel import kt_kernel_ext
|
|
from kt_kernel_ext.kvcache import ggml_type
|
|
import torch
|
|
from torch import inf, nn
|
|
from torch.nn import init
|
|
from torch_attention import apply_rotary_pos_emb, DeepseekV2RMSNorm, KDeepSeekV3Cache, DeepseekV3YarnRotaryEmbedding
|
|
|
|
logger = logging.getLogger("reader")
|
|
|
|
from gguf.gguf_reader import GGUFReader
|
|
|
|
|
|
def load_fp32_tensor_raw(file_path):
    """Load a raw binary dump as a flat float32 tensor.

    Args:
        file_path: Path to a file containing tightly packed little/native-endian
            float32 values (one element per 4 bytes).

    Returns:
        A 1-D ``torch.float32`` tensor with the file's contents.
    """
    with open(file_path, "rb") as f:
        raw_data = f.read()
    # Copy into a writable bytearray: torch.frombuffer on an immutable `bytes`
    # object emits a UserWarning and returns a read-only view of the buffer.
    tensor = torch.frombuffer(bytearray(raw_data), dtype=torch.float32)
    return tensor
|
|
|
|
|
|
def load_fp16_tensor(file_path, shape=None):
    """Load a debug dump as a flat float32 tensor.

    NOTE: despite the name, the dumps this script reads are stored as raw
    float32 data, so this reads float32 — not float16.  The original body
    returned early and left unreachable code referencing an undefined
    ``weight_type``; that dead code has been removed.

    Args:
        file_path: Path to the binary dump file.
        shape: Accepted for call-site compatibility but ignored — the data
            is returned flat, exactly as the original behaved.

    Returns:
        A 1-D ``torch.float32`` tensor with the file's contents.
    """
    with open(file_path, "rb") as f:
        raw_data = f.read()
    # bytearray copy avoids torch.frombuffer's non-writable-buffer warning
    # and the resulting read-only tensor.
    return torch.frombuffer(bytearray(raw_data), dtype=torch.float32)
|
|
|
|
|
|
def load_fp32_tensor(file_path, shape):
    """Load a raw float32 binary dump and view it with the given shape.

    Args:
        file_path: Path to a file of tightly packed float32 values.
        shape: Target shape; the file's element count must match
            ``prod(shape)`` or ``Tensor.view`` raises a RuntimeError.

    Returns:
        A ``torch.float32`` tensor viewed as ``shape``.
    """
    with open(file_path, "rb") as f:
        raw_data = f.read()
    # bytearray copy avoids torch.frombuffer's non-writable-buffer warning
    # and the resulting read-only tensor.
    tensor = torch.frombuffer(bytearray(raw_data), dtype=torch.float32)
    tensor = tensor.view(shape)  # reshape to the caller-provided shape
    return tensor
|
|
|
|
|
|
def test_torch():
    """Load matching intermediate debug dumps from a decode run and a prefill run.

    Reads tensors dumped under ``./debug_decode/`` and ``./debug_prefill/`` at
    each stage of the MLA attention pipeline (input hidden states -> q_lora ->
    compressed KV (pre/post layernorm) -> q/k RoPE -> PE attention weights ->
    softmaxed attention weights -> final output) so the two paths can be
    compared step by step.  All of the actual diff/MAE comparisons are
    currently commented out; as written, the function only performs the file
    loads (which will raise FileNotFoundError if the dump directories are
    absent) and returns None.

    NOTE(review): the shape arguments passed to load_fp16_tensor below are
    ignored by that helper — the loaded tensors stay flat.

    Returns:
        None.
    """
    torch.set_grad_enabled(False)

    # --- Stage 1: input hidden states -------------------------------------
    hidden_states_to_check_decode = load_fp16_tensor("./debug_decode/query_0_tp_0_input.bin")
    hidden_states_to_check_prefill = load_fp16_tensor("./debug_prefill/query_0_tp_0_input.bin")
    # diff = torch.abs(hidden_states_to_check_prefill - hidden_states_to_check_decode).max()
    # print("hidden_states diff -> ", diff)

    # --- Stage 2: q_lora (down-projected query) ----------------------------
    q_lora_to_check_decode = load_fp16_tensor("./debug_decode/query_0_tp_0_qlora.bin")
    q_lora_to_check_test_decode = load_fp16_tensor("./debug_decode/query_0_tp_0_qlora_test.bin")
    q_lora_to_check_prefill = load_fp16_tensor("./debug_prefill/query_0_tp_0_qlora.bin")
    q_lora_to_check_test_prefill = load_fp16_tensor("./debug_prefill/query_0_tp_0_qlora_test.bin")
    # diff = torch.abs(q_lora_to_check_prefill - q_lora_to_check_decode).max()
    # diff_test = torch.abs(q_lora_to_check_prefill - q_lora_to_check_decode).max()
    # print("q_lora max diff -> ", diff)
    # print("q_lora max diff test -> ", diff_test)
    # mae = torch.mean(torch.abs(q_lora_to_check_prefill - q_lora_to_check_decode))
    # mae_test = torch.mean(torch.abs(q_lora_to_check_prefill - q_lora_to_check_decode))
    # print("q_lora mae -> ", mae)
    # print("q_lora mae test -> ", mae_test)

    # q_lora_norm = q_a_layernorm(q_lora)
    # q_lora_norm_to_check = load_fp16_tensor('./debug/query_0_tp_0_qlora_norm.bin', q_lora_norm.shape)
    # q_lora_norm_to_check_test = load_fp16_tensor('./debug/query_0_tp_0_qlora_norm_test.bin', q_lora_norm.shape)
    # diff = torch.abs(q_lora_norm - q_lora_norm_to_check).max()
    # mae = torch.mean(torch.abs(q_lora_norm - q_lora_norm_to_check))
    # diff_test = torch.abs(q_lora_norm - q_lora_norm_to_check_test).max()
    # mae_test = torch.mean(torch.abs(q_lora_norm - q_lora_norm_to_check_test))
    # print("q_lora_norm diff -> ", diff)
    # print("q_lora_norm mae -> ", mae)
    # print("q_lora_norm diff test -> ", diff_test)
    # print("q_lora_norm mae test -> ", mae_test)

    # q = q_b_proj(q_lora_norm)
    # for v3, bsz, qlen, num_heads(128), qk_head_dim(192=128(nope)+64(rope))
    # q = q.view(qlen, num_heads, nope_size+rope_size)
    # q_nope is [qlen, num_heads(128), qk_nope_head_dim(128)]
    # q_pe is [qlen, num_heads(128), qk_rope_head_dim(64)]
    # q_nope, q_pe = torch.split(
    #     q, [nope_size, rope_size], dim=-1
    # )

    # compressed_kv is [qlen, kv_lora_rank(512) + rope(64)]
    # compressed_kv = kv_a_proj_with_mqa(batch_hidden_states)
    # compressed_kv is [qlen, kv_lora_rank(512)], k_pe is [qlen, rope(64)]
    # compressed_kv, k_pe = torch.split(
    #     compressed_kv, [kv_lora_rank, rope_size], dim=-1
    # )
    # compressed_kv = compressed_kv.contiguous()

    # --- Stage 3: compressed KV for page 0, before layernorm ---------------
    # compressed_kv_page_0 = compressed_kv[0:page_size, :]
    compressed_kv_to_check_decode = load_fp16_tensor("./debug_decode/query_0_tp_0_page_0_kv_lora_rank")
    compressed_kv_to_check_prefill = load_fp16_tensor("./debug_prefill/query_0_tp_0_page_0_kv_lora_rank")
    # diff = torch.abs(compressed_kv_to_check_prefill - compressed_kv_to_check_decode).max()
    # mae = torch.mean(torch.abs(compressed_kv_to_check_prefill - compressed_kv_to_check_decode))
    # print("compressed_kv diff -> ", diff)
    # print("compressed_kv mae -> ", mae)

    # compressed_kv = kv_a_layernorm(compressed_kv)
    # k_pe is [qlen, 1, qk_rope_head_dim(64)]

    # --- Stage 4: compressed KV for page 0, after layernorm ----------------
    # NOTE(review): these rebind the same two variables as Stage 3, so the
    # pre-layernorm tensors above are discarded here.
    # compressed_kv_page_0 = compressed_kv[0:page_size, :]
    compressed_kv_to_check_decode = load_fp16_tensor("./debug_decode/query_0_tp_0_page_0_kv_lora_rank_norm")
    compressed_kv_to_check_prefill = load_fp16_tensor("./debug_prefill/query_0_tp_0_page_0_kv_lora_rank_norm")
    # diff = torch.abs(compressed_kv_page_0 - compressed_kv_to_check).max()
    # mae = torch.mean(torch.abs(compressed_kv_page_0 - compressed_kv_to_check))
    # print("compressed_kv diff norm -> ", diff)
    # print("compressed_kv mae norm -> ", mae)

    # k_pe = k_pe.view(qlen, 1, rope_size)
    # compressed_kv is [qlen, 1, kv_lora_rank(512)]
    # compressed_kv = compressed_kv.view(qlen, 1, kv_lora_rank)

    # cos, sin = rotary_emb(q_pe, batch_position_ids)

    # q_nope_check = q_nope.transpose(0, 1) # qlen is 1, no GPU overhead, same below

    # q_nope_0_to_check = load_fp16_tensor('./debug/query_0_tp_0_q_nope', q_nope_check[0].shape)
    # q_nope_0_to_check_test = load_fp16_tensor('./debug/query_0_tp_0_q_nope_test', q_nope_check[0].shape)
    # diff = torch.abs(q_nope_check[0] - q_nope_0_to_check).max()
    # mae = torch.mean(torch.abs(q_nope_check[0] - q_nope_0_to_check))
    # diff_test = torch.abs(q_nope_check[0] - q_nope_0_to_check_test).max()
    # mae_test = torch.mean(torch.abs(q_nope_check[0] - q_nope_0_to_check_test))
    # print("q_nope[0] diff -> ", diff)
    # print("q_nope[0] mae -> ", mae)
    # print("q_nope[0] diff test -> ", diff_test)
    # print("q_nope[0] mae test -> ", mae_test)

    # --- Stage 5: query RoPE part ------------------------------------------
    # q_pe_nope = q_pe.transpose(0,1)
    q_pe_0_to_check_decode = load_fp16_tensor("./debug_decode/query_0_tp_0_q_rope")
    q_pe_0_to_check_prefill = load_fp16_tensor("./debug_prefill/query_0_tp_0_q_rope")

    # q_pe_0_to_check_decode_test = load_fp16_tensor('./debug_decode/query_0_tp_0_q_rope_test')
    # q_pe_0_to_check_prefill_test = load_fp16_tensor('./debug_prefill/query_0_tp_0_q_rope_test')

    # q_pe_0_to_check = load_fp16_tensor('./debug/query_0_tp_0_q_rope_no_rope', q_pe_nope[0].shape)
    # q_pe_0_to_check_test = load_fp16_tensor('./debug/query_0_tp_0_q_rope_no_rope_test', q_pe_nope[0].shape)
    # diff = torch.abs(q_pe_nope[0] - q_pe_0_to_check).max()
    # mae = torch.mean(torch.abs(q_pe_nope[0] - q_pe_0_to_check))
    # diff_test = torch.abs(q_pe_nope[0] - q_pe_0_to_check_test).max()
    # mae_test = torch.mean(torch.abs(q_pe_nope[0] - q_pe_0_to_check_test))
    # print("q_pe nope[0] diff -> ", diff)
    # print("q_pe nope[0] mae -> ", mae)
    # print("q_pe nope[0] diff test -> ", diff_test)
    # print("q_pe nope[0] mae test -> ", mae_test)

    # cos_to_check = load_fp32_tensor('./debug/query_0_tp_0_rope_cos', (qlen,32))
    # diff = torch.abs(cos[:,:32]-cos_to_check).max()
    # mae = torch.mean(torch.abs(cos[:,:32]-cos_to_check))
    # print("cos diff -> ", diff)
    # print("cos mae -> ", mae)
    # sin_to_check = load_fp32_tensor('./debug/query_0_tp_0_rope_sin', (qlen,32))
    # diff = torch.abs(sin[:,:32]-sin_to_check).max()
    # mae = torch.mean(torch.abs(sin[:,:32]-sin_to_check))
    # print("sin diff -> ", diff)
    # print("sin mae -> ", mae)

    # Hand-rolled interleaved RoPE for cross-checking apply_rotary_pos_emb:
    # new_q_pe = q_pe.transpose(0, 1)
    # qa = new_q_pe[:,:,range(0,64,2)]
    # qb = new_q_pe[:,:,range(1,65,2)]
    # q1 = (qa * cos[:,:32] - qb * sin[:,:32])
    # q2 = (qb*cos[:,:32] + qa*sin[:,:32])
    # q1 = (qa * cos_to_check - qb * sin_to_check)
    # q2 = (qb*cos_to_check + qa*sin_to_check)
    # q_new = torch.cat((q1,q2), dim=-1)
    # print(f"q_pe shape{q_pe.shape}, k_pe shape {k_pe.shape}")
    # new_q_pe = torch.zeros_like(q_pe)
    # new_q_pe[:,:,range(0,64,2)] = 1
    # new_q_pe[:,:,range(1,65,2)] = 10
    # q_pe, k_pe = apply_rotary_pos_emb(q_pe.unsqueeze(0), k_pe.unsqueeze(0), cos, sin, unsqueeze_dim=1)
    # q_pe = q_pe.squeeze(0)
    # q_pe is [num_heads(128), qlen, qk_rope_head_dim(64)]
    # q_pe.transpose_(0, 1)

    # diff = torch.abs(q_pe - q_new).max()
    # print("q_pe diff -> ", diff)

    # q_pe_0_to_check = load_fp16_tensor('./debug/query_0_tp_0_q_rope', q_pe[0].shape)
    # diff = torch.abs(q_pe[0] - q_pe_0_to_check).max()
    # mae = torch.mean(torch.abs(q_pe[0] - q_pe_0_to_check))
    # print("q_pe[0] diff -> ", diff)
    # print("q_pe[0] mae -> ", mae)

    # diff = torch.abs(q_pe_0_to_check - q_new[0]).max()
    # mae = torch.mean(torch.abs(q_pe_0_to_check - q_new[0]))
    # print("q_pe[0] 2 diff -> ", diff)
    # print("q_pe[0] 2 mae -> ", mae)

    # if kv_cache is not None:
    #     cache_kwargs = {"sin": sin, "cos": cos, "page_idx": batch_page_idx, "page_offset": batch_page_offset} # Specific to RoPE models
    #     compressed_kv_with_k_pe = kv_cache.update(compressed_kv.unsqueeze(0), k_pe, layer_idx, batch_page_idx, batch_page_offset, cache_kwargs)
    #     compressed_kv = compressed_kv_with_k_pe [:, :, :, :kv_lora_rank].view(-1, page_size, kv_lora_rank)
    #     k_pe = compressed_kv_with_k_pe [:, :, :, kv_lora_rank:].view(-1, page_size, rope_size)
    # # q_absorb is [num_heads(128), qk_nope_head_dim(128), kv_lora_rank(512)]
    # # out_absorb is [num_heads(128), kv_lora_rank(512), v_head_dim(128)] v_head_dim is also the nope dim
    # # q_absorb, out_absorb = get_absorbed()
    # # q_nope is [num_heads(128), qlen, qk_nope_head_dim(128)]
    # q_nope = q_nope.transpose(0, 1) # qlen is 1, no GPU overhead, same below

    # q_nope_0_to_check = load_fp16_tensor('./debug/query_0_tp_0_q_nope', q_nope[0].shape)
    # diff = torch.abs(q_nope[0] - q_nope_0_to_check).max()
    # mae = torch.mean(torch.abs(q_nope[0] - q_nope_0_to_check))
    # print("q_nope[0] diff -> ", diff)

    # # q_nope is [num_heads(128), qlen, kv_lora_rank(512)]
    # q_nope = torch.matmul(q_nope, q_absorb) # batched MM

    # k_b_proj_check = load_fp16_tensor('./debug/query_0_tp_0_k_b_lora', (nope_size,kv_lora_rank))
    # diff = torch.abs(q_absorb[0] - k_b_proj_check).max()
    # print("kv b lora weight[0] diff -> ", diff)

    # q_absorb_check = load_fp16_tensor('./debug/query_0_tp_0_q_absorb', (kv_lora_rank,1024))
    # q_absorb_check = q_absorb_check[:,0:qlen].transpose(0,1)
    # diff = torch.abs(q_nope[0] - q_absorb_check).max()
    # mae = torch.mean(torch.abs(q_nope[0] - q_absorb_check))
    # print("q_nope absorb diff -> ", diff)
    # print("q_nope absorb mae -> ", mae)

    # # q_nope is [qlen, num_heads(128), kv_lora_rank(512)]
    # q_nope = q_nope.transpose(0, 1)

    # we need to index out the compressed_kv and k_pe for the current batch
    # batch_compressed_kv = None
    # batch_k_pe = None
    # for page_index in kv_index:
    #     if kv_total_len > page_size:
    #         tmp_compressed_kv = compressed_kv[page_index, 0:page_size, :]
    #         tmp_k_pe = k_pe[page_index, 0:page_size, :]
    #         if batch_compressed_kv is None or batch_k_pe is None:
    #             batch_compressed_kv = tmp_compressed_kv
    #             batch_k_pe = tmp_k_pe
    #         else:
    #             batch_compressed_kv = torch.cat((batch_compressed_kv, tmp_compressed_kv), dim=0)
    #             batch_k_pe = torch.cat((batch_k_pe, tmp_k_pe), dim=0)
    #         kv_total_len -= page_size
    #     else:
    #         tmp_compressed_kv = compressed_kv[page_index, 0:kv_total_len, :]
    #         tmp_k_pe = k_pe[page_index, 0:kv_total_len, :]
    #         if batch_compressed_kv is None or batch_k_pe is None:
    #             batch_compressed_kv = tmp_compressed_kv
    #             batch_k_pe = tmp_k_pe
    #         else:
    #             batch_compressed_kv = torch.cat((batch_compressed_kv, tmp_compressed_kv), dim=0)
    #             batch_k_pe = torch.cat((batch_k_pe, tmp_k_pe), dim=0)
    #         break
    # batch_compressed_kv is [kv_total_len(k_len), kv_lora_rank(512)]
    # batch_k_pe is [kv_total_len(k_len), qk_rope_head_dim(64)]

    # --- Stage 6: key RoPE part for page 0 ---------------------------------
    # NOTE(review): the (256, 64) shape hint is ignored by load_fp16_tensor.
    k_pe_to_check_decode = load_fp16_tensor("./debug_decode/query_0_tp_0_page_0_k_rope", (256, 64))
    k_pe_to_check_prefill = load_fp16_tensor("./debug_prefill/query_0_tp_0_page_0_k_rope", (256, 64))
    # diff = torch.abs(k_pe_to_check_prefill - k_pe_to_check_decode).max()
    # mae = torch.mean(k_pe_to_check_prefill - k_pe_to_check_decode)
    # print("k_pe diff -> ", diff)
    # print("k_pe mae -> ", mae)

    # --- Stage 7: positional-embedding attention weights -------------------
    # pe_weights = torch.matmul(q_pe,batch_k_pe.mT)
    # kv_total_len = kv_page_nums * page_size
    pe_weights_0_decode = load_fp16_tensor("./debug_decode/query_0_tp_0_pe_attention_weights", (1024, 4096))
    pe_weights_0_prefill = load_fp16_tensor("./debug_prefill/query_0_tp_0_pe_attention_weights", (1024, 4096))

    # diff = torch.abs(pe_weights[0] - pe_weights_0).max()
    # print("pe_weights[0] diff -> ", diff)

    # attention_weights = (pe_weights + torch.matmul(q_nope, batch_compressed_kv.mT))

    # raw_weights = load_fp16_tensor('./debug/query_0_tp_0_raw_attention_weights', (1024, 4096))
    # raw_weights = raw_weights[0:qlen, 0:kv_total_len]
    # diff = torch.abs(attention_weights[0] - raw_weights).max()
    # print("raw attention_weigh/ts[0] diff -> ", diff)

    # attention_weights = attention_weights * softmax_scale
    # attention_weights is [num_heads(128), qlen, k_len]

    # attention_weights = attention_weights.transpose(0,1).unsqueeze(0).squeeze(-1).expand(qlen,-1,-1).transpose(0,1)

    # attention_masks[i] is [qlen, k_len]

    # attention_weights = (attention_weights + attention_masks)
    # attention_weights shape is [num_heads(128), qlen, k_len]

    # attention_weights = nn.functional.softmax(attention_weights,dim=-1,dtype=weight_type).to(q_pe.dtype)

    # --- Stage 8: post-softmax attention weights ---------------------------
    attention_weights_0_decode = load_fp16_tensor("./debug_decode/query_0_tp_0_attention_weights", (1024, 4096))
    attention_weights_0_prefill = load_fp16_tensor("./debug_prefill/query_0_tp_0_attention_weights", (1024, 4096))

    # attention_weights_0 = attention_weights_0[0:qlen, 0:kv_total_len]
    # diff = torch.abs(attention_weights[0] - attention_weights_0).max()
    # print("attention_weights[0] diff -> ", diff)

    # attn_output = torch.matmul(attention_weights, batch_compressed_kv) # [num_heads(128),qlen, lora_rank(512)]
    # out_absorb shape is [num_heads(128), kv_lora_rank(512), v_head_dim(128)]

    # o_absorb_check = load_fp16_tensor('./debug/query_0_tp_0_o_absorb', (qlen,kv_lora_rank))
    # diff = torch.abs(attn_output[0] - o_absorb_check).max()
    # print("o absorb[0] diff -> ", diff)

    # out_absorb = out_absorb.transpose(1, 2) # [qlen, num_heads(128), v_head_dim(128)]
    # # q for qlen, n for num_heads, h for v_head_dim, v for kv_lora_rank
    # attn_output = torch.matmul(attn_output, out_absorb) # [num_heads(128), qlen, v_head_dim(128)]

    # attn_output_check_0 = load_fp16_tensor('./debug/query_0_tp_0_attention_output', (qlen, nope_size))
    # diff = torch.abs(attn_output[0] - attn_output_check_0).max()
    # print("attn_output[0] diff -> ", diff)

    # attn_output = attn_output.transpose(0, 1) # [qlen, num_heads(128), v_head_dim(128)]
    # attn_output = attn_output.reshape(qlen, num_heads * nope_size)

    # w_o = o_proj.weight.view([hidden_size,num_heads * nope_size])
    # output = torch.matmul(attn_output,w_o.transpose(0,1))
    # output = output.view(qlen, hidden_size)

    # output_0_check = load_fp16_tensor('./debug/query_0_tp_0_qlen_output', (qlen, hidden_size))
    # h1_o = w_o[:,:128]
    # local_o_check = load_fp16_tensor('./debug/query_0_tp_0_local_w_o', (hidden_size, 128))
    # diff = torch.abs(local_o_check - h1_o).max()
    # print("local w_o diff -> ", diff)

    # h1_output = torch.matmul(attn_output[:,:128],h1_o.transpose(0,1))
    # diff = torch.abs(h1_output - output_0_check).max()
    # print("h1_output diff -> ", diff)

    # --- Stage 9: final layer output ---------------------------------------
    output_check_decode = load_fp16_tensor("./debug_decode/output.bin")
    output_check_prefill = load_fp16_tensor("./debug_prefill/output.bin")
    # diff = torch.abs(output - output_check).max()
    # mae = torch.mean(torch.abs(output - output_check))
    # print("output diff -> ", diff)

    return None
|
|
|
|
|
|
# --- Script entry point: run the torch-side load/compare pass ---------------
torch.set_printoptions(sci_mode=False, precision=5)
# output_cpu = test_cpu_mla()
# output_cpu_quant = test_cpu_mla_quant()
output_torch = test_torch()  # currently returns None; comparisons are commented out inside
# print("Output CPU: ", output_cpu)
# print("Output CPU: ", output_cpu_quant)
# print("Output Torch: ", output_torch)
# diff = (output_cpu - output_torch).abs()
# # compute the relative error
# diff_relative = diff / (output_cpu.abs())
# # replace NaNs in diff_relative with 0
# diff_relative = torch.where(torch.isnan(diff_relative), torch.zeros_like(diff_relative), diff_relative)
# diff_relative_mean = torch.mean(torch.abs(output_cpu-output_torch)) / torch.mean(torch.abs(output_torch))

# print(f'Diff: ave:{diff.mean()}, max:{diff.max()}, min:{diff.min()}, relative_mean:{diff_relative_mean}, relative_max:{diff_relative.max()}, relative_min:{diff_relative.min()}')
# assert diff_relative_mean < 2e-1, "CPU and Torch outputs are not close enough!"
|