mirror of
https://github.com/kvcache-ai/ktransformers.git
synced 2026-04-20 14:29:22 +00:00
@@ -17,6 +17,7 @@ register_cpu_ci(est_time=30, suite="default")
|
||||
# Check if kt_kernel_ext is available
|
||||
try:
|
||||
import kt_kernel # Import kt_kernel first to register kt_kernel_ext
|
||||
|
||||
kt_kernel_ext = kt_kernel.kt_kernel_ext # Access the extension module
|
||||
HAS_KT_KERNEL = True
|
||||
except ImportError:
|
||||
@@ -51,7 +52,7 @@ def test_basic_module_attributes():
|
||||
pytest.skip("kt_kernel_ext not built or available")
|
||||
|
||||
# Check for key attributes/functions
|
||||
assert hasattr(kt_kernel_ext, 'CPUInfer'), "kt_kernel_ext should have CPUInfer class"
|
||||
assert hasattr(kt_kernel_ext, "CPUInfer"), "kt_kernel_ext should have CPUInfer class"
|
||||
|
||||
|
||||
def run_all_tests():
|
||||
|
||||
@@ -20,6 +20,7 @@ register_cpu_ci(est_time=120, suite="default")
|
||||
try:
|
||||
import torch
|
||||
import kt_kernel # Import kt_kernel first to register kt_kernel_ext
|
||||
|
||||
kt_kernel_ext = kt_kernel.kt_kernel_ext # Access the extension module
|
||||
HAS_DEPS = True
|
||||
except ImportError as e:
|
||||
@@ -68,9 +69,7 @@ def moe_torch(input, expert_ids, weights, gate_proj, up_proj, down_proj):
|
||||
if num_tokens == 0:
|
||||
continue
|
||||
tokens_for_this_expert = sorted_tokens[start_idx:end_idx]
|
||||
expert_out = mlp_torch(
|
||||
tokens_for_this_expert, gate_proj[i], up_proj[i], down_proj[i]
|
||||
)
|
||||
expert_out = mlp_torch(tokens_for_this_expert, gate_proj[i], up_proj[i], down_proj[i])
|
||||
outputs.append(expert_out)
|
||||
start_idx = end_idx
|
||||
|
||||
@@ -96,9 +95,7 @@ def test_moe_amx_int4_accuracy():
|
||||
pytest.skip(f"Dependencies not available: {import_error}")
|
||||
|
||||
global physical_to_logical_map
|
||||
physical_to_logical_map = torch.tensor(
|
||||
data=range(expert_num), device="cpu", dtype=torch.int64
|
||||
).contiguous()
|
||||
physical_to_logical_map = torch.tensor(data=range(expert_num), device="cpu", dtype=torch.int64).contiguous()
|
||||
|
||||
CPUInfer = kt_kernel_ext.CPUInfer(60)
|
||||
|
||||
@@ -133,9 +130,7 @@ def test_moe_amx_int4_accuracy():
|
||||
)
|
||||
|
||||
# Create MOE config
|
||||
config = kt_kernel_ext.moe.MOEConfig(
|
||||
expert_num, num_experts_per_tok, hidden_size, intermediate_size, 0
|
||||
)
|
||||
config = kt_kernel_ext.moe.MOEConfig(expert_num, num_experts_per_tok, hidden_size, intermediate_size, 0)
|
||||
config.max_len = max_len
|
||||
config.gate_proj = gate_proj.data_ptr()
|
||||
config.up_proj = up_proj.data_ptr()
|
||||
@@ -176,14 +171,10 @@ def test_moe_amx_int4_accuracy():
|
||||
CPUInfer.sync()
|
||||
|
||||
# Run torch reference
|
||||
t_output = moe_torch(
|
||||
input_data, expert_ids, weights, gate_proj, up_proj, down_proj
|
||||
)
|
||||
t_output = moe_torch(input_data, expert_ids, weights, gate_proj, up_proj, down_proj)
|
||||
|
||||
# Calculate relative difference
|
||||
diff = torch.mean(torch.abs(output - t_output)) / torch.mean(
|
||||
torch.abs(t_output)
|
||||
)
|
||||
diff = torch.mean(torch.abs(output - t_output)) / torch.mean(torch.abs(t_output))
|
||||
print(f"Iteration {i}, diff = {diff:.6f}")
|
||||
|
||||
# INT4 should have diff < 0.35
|
||||
@@ -205,6 +196,7 @@ def run_all_tests():
|
||||
except Exception as e:
|
||||
print(f"\n✗ Test failed: {e}")
|
||||
import traceback
|
||||
|
||||
traceback.print_exc()
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
@@ -20,6 +20,7 @@ register_cpu_ci(est_time=120, suite="default")
|
||||
try:
|
||||
import torch
|
||||
import kt_kernel # Import kt_kernel first to register kt_kernel_ext
|
||||
|
||||
kt_kernel_ext = kt_kernel.kt_kernel_ext # Access the extension module
|
||||
HAS_DEPS = True
|
||||
except ImportError as e:
|
||||
@@ -68,9 +69,7 @@ def moe_torch(input, expert_ids, weights, gate_proj, up_proj, down_proj):
|
||||
if num_tokens == 0:
|
||||
continue
|
||||
tokens_for_this_expert = sorted_tokens[start_idx:end_idx]
|
||||
expert_out = mlp_torch(
|
||||
tokens_for_this_expert, gate_proj[i], up_proj[i], down_proj[i]
|
||||
)
|
||||
expert_out = mlp_torch(tokens_for_this_expert, gate_proj[i], up_proj[i], down_proj[i])
|
||||
outputs.append(expert_out)
|
||||
start_idx = end_idx
|
||||
|
||||
@@ -96,9 +95,7 @@ def test_moe_amx_int4_1_accuracy():
|
||||
pytest.skip(f"Dependencies not available: {import_error}")
|
||||
|
||||
global physical_to_logical_map
|
||||
physical_to_logical_map = torch.tensor(
|
||||
data=range(expert_num), device="cpu", dtype=torch.int64
|
||||
).contiguous()
|
||||
physical_to_logical_map = torch.tensor(data=range(expert_num), device="cpu", dtype=torch.int64).contiguous()
|
||||
|
||||
CPUInfer = kt_kernel_ext.CPUInfer(60)
|
||||
|
||||
@@ -133,9 +130,7 @@ def test_moe_amx_int4_1_accuracy():
|
||||
)
|
||||
|
||||
# Create MOE config
|
||||
config = kt_kernel_ext.moe.MOEConfig(
|
||||
expert_num, num_experts_per_tok, hidden_size, intermediate_size, 0
|
||||
)
|
||||
config = kt_kernel_ext.moe.MOEConfig(expert_num, num_experts_per_tok, hidden_size, intermediate_size, 0)
|
||||
config.max_len = max_len
|
||||
config.gate_proj = gate_proj.data_ptr()
|
||||
config.up_proj = up_proj.data_ptr()
|
||||
@@ -176,14 +171,10 @@ def test_moe_amx_int4_1_accuracy():
|
||||
CPUInfer.sync()
|
||||
|
||||
# Run torch reference
|
||||
t_output = moe_torch(
|
||||
input_data, expert_ids, weights, gate_proj, up_proj, down_proj
|
||||
)
|
||||
t_output = moe_torch(input_data, expert_ids, weights, gate_proj, up_proj, down_proj)
|
||||
|
||||
# Calculate relative difference
|
||||
diff = torch.mean(torch.abs(output - t_output)) / torch.mean(
|
||||
torch.abs(t_output)
|
||||
)
|
||||
diff = torch.mean(torch.abs(output - t_output)) / torch.mean(torch.abs(t_output))
|
||||
print(f"Iteration {i}, diff = {diff:.6f}")
|
||||
|
||||
# INT4_1 should have diff < 0.35
|
||||
@@ -205,6 +196,7 @@ def run_all_tests():
|
||||
except Exception as e:
|
||||
print(f"\n✗ Test failed: {e}")
|
||||
import traceback
|
||||
|
||||
traceback.print_exc()
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
@@ -20,6 +20,7 @@ register_cpu_ci(est_time=120, suite="default")
|
||||
try:
|
||||
import torch
|
||||
import kt_kernel # Import kt_kernel first to register kt_kernel_ext
|
||||
|
||||
kt_kernel_ext = kt_kernel.kt_kernel_ext # Access the extension module
|
||||
HAS_DEPS = True
|
||||
except ImportError as e:
|
||||
@@ -69,9 +70,7 @@ def moe_torch(input, expert_ids, weights, gate_proj, up_proj, down_proj):
|
||||
if num_tokens == 0:
|
||||
continue
|
||||
tokens_for_this_expert = sorted_tokens[start_idx:end_idx]
|
||||
expert_out = mlp_torch(
|
||||
tokens_for_this_expert, gate_proj[i], up_proj[i], down_proj[i]
|
||||
)
|
||||
expert_out = mlp_torch(tokens_for_this_expert, gate_proj[i], up_proj[i], down_proj[i])
|
||||
outputs.append(expert_out)
|
||||
start_idx = end_idx
|
||||
|
||||
@@ -97,9 +96,7 @@ def test_moe_amx_int4_1k_accuracy():
|
||||
pytest.skip(f"Dependencies not available: {import_error}")
|
||||
|
||||
global physical_to_logical_map
|
||||
physical_to_logical_map = torch.tensor(
|
||||
data=range(expert_num), device="cpu", dtype=torch.int64
|
||||
).contiguous()
|
||||
physical_to_logical_map = torch.tensor(data=range(expert_num), device="cpu", dtype=torch.int64).contiguous()
|
||||
|
||||
CPUInfer = kt_kernel_ext.CPUInfer(60)
|
||||
|
||||
@@ -134,9 +131,7 @@ def test_moe_amx_int4_1k_accuracy():
|
||||
)
|
||||
|
||||
# Create MOE config
|
||||
config = kt_kernel_ext.moe.MOEConfig(
|
||||
expert_num, num_experts_per_tok, hidden_size, intermediate_size, 0
|
||||
)
|
||||
config = kt_kernel_ext.moe.MOEConfig(expert_num, num_experts_per_tok, hidden_size, intermediate_size, 0)
|
||||
config.max_len = max_len
|
||||
config.gate_proj = gate_proj.data_ptr()
|
||||
config.up_proj = up_proj.data_ptr()
|
||||
@@ -180,14 +175,10 @@ def test_moe_amx_int4_1k_accuracy():
|
||||
CPUInfer.sync()
|
||||
|
||||
# Run torch reference
|
||||
t_output = moe_torch(
|
||||
input_data, expert_ids, weights, gate_proj, up_proj, down_proj
|
||||
)
|
||||
t_output = moe_torch(input_data, expert_ids, weights, gate_proj, up_proj, down_proj)
|
||||
|
||||
# Calculate relative difference
|
||||
diff = torch.mean(torch.abs(output - t_output)) / torch.mean(
|
||||
torch.abs(t_output)
|
||||
)
|
||||
diff = torch.mean(torch.abs(output - t_output)) / torch.mean(torch.abs(t_output))
|
||||
print(f"Iteration {i}, diff = {diff:.6f}")
|
||||
|
||||
# INT4_1K should have diff < 0.35
|
||||
@@ -209,6 +200,7 @@ def run_all_tests():
|
||||
except Exception as e:
|
||||
print(f"\n✗ Test failed: {e}")
|
||||
import traceback
|
||||
|
||||
traceback.print_exc()
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
@@ -20,6 +20,7 @@ register_cpu_ci(est_time=120, suite="default")
|
||||
try:
|
||||
import torch
|
||||
import kt_kernel # Import kt_kernel first to register kt_kernel_ext
|
||||
|
||||
kt_kernel_ext = kt_kernel.kt_kernel_ext # Access the extension module
|
||||
HAS_DEPS = True
|
||||
except ImportError as e:
|
||||
@@ -68,9 +69,7 @@ def moe_torch(input, expert_ids, weights, gate_proj, up_proj, down_proj):
|
||||
if num_tokens == 0:
|
||||
continue
|
||||
tokens_for_this_expert = sorted_tokens[start_idx:end_idx]
|
||||
expert_out = mlp_torch(
|
||||
tokens_for_this_expert, gate_proj[i], up_proj[i], down_proj[i]
|
||||
)
|
||||
expert_out = mlp_torch(tokens_for_this_expert, gate_proj[i], up_proj[i], down_proj[i])
|
||||
outputs.append(expert_out)
|
||||
start_idx = end_idx
|
||||
|
||||
@@ -96,9 +95,7 @@ def test_moe_amx_int8_accuracy():
|
||||
pytest.skip(f"Dependencies not available: {import_error}")
|
||||
|
||||
global physical_to_logical_map
|
||||
physical_to_logical_map = torch.tensor(
|
||||
data=range(expert_num), device="cpu", dtype=torch.int64
|
||||
).contiguous()
|
||||
physical_to_logical_map = torch.tensor(data=range(expert_num), device="cpu", dtype=torch.int64).contiguous()
|
||||
|
||||
CPUInfer = kt_kernel_ext.CPUInfer(60)
|
||||
|
||||
@@ -133,9 +130,7 @@ def test_moe_amx_int8_accuracy():
|
||||
)
|
||||
|
||||
# Create MOE config
|
||||
config = kt_kernel_ext.moe.MOEConfig(
|
||||
expert_num, num_experts_per_tok, hidden_size, intermediate_size, 0
|
||||
)
|
||||
config = kt_kernel_ext.moe.MOEConfig(expert_num, num_experts_per_tok, hidden_size, intermediate_size, 0)
|
||||
config.max_len = max_len
|
||||
config.gate_proj = gate_proj.data_ptr()
|
||||
config.up_proj = up_proj.data_ptr()
|
||||
@@ -174,14 +169,10 @@ def test_moe_amx_int8_accuracy():
|
||||
CPUInfer.sync()
|
||||
|
||||
# Run torch reference
|
||||
t_output = moe_torch(
|
||||
input_data, expert_ids, weights, gate_proj, up_proj, down_proj
|
||||
)
|
||||
t_output = moe_torch(input_data, expert_ids, weights, gate_proj, up_proj, down_proj)
|
||||
|
||||
# Calculate relative difference
|
||||
diff = torch.mean(torch.abs(output - t_output)) / torch.mean(
|
||||
torch.abs(t_output)
|
||||
)
|
||||
diff = torch.mean(torch.abs(output - t_output)) / torch.mean(torch.abs(t_output))
|
||||
print(f"Iteration {i}, diff = {diff:.6f}")
|
||||
|
||||
# INT8 should have diff < 0.05
|
||||
@@ -203,6 +194,7 @@ def run_all_tests():
|
||||
except Exception as e:
|
||||
print(f"\n✗ Test failed: {e}")
|
||||
import traceback
|
||||
|
||||
traceback.print_exc()
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
@@ -24,8 +24,10 @@ register_cpu_ci(est_time=300, suite="default")
|
||||
try:
|
||||
import torch
|
||||
import kt_kernel # Import kt_kernel first to register kt_kernel_ext
|
||||
|
||||
kt_kernel_ext = kt_kernel.kt_kernel_ext # Access the extension module
|
||||
from tqdm import tqdm
|
||||
|
||||
HAS_DEPS = True
|
||||
except ImportError as e:
|
||||
HAS_DEPS = False
|
||||
@@ -306,6 +308,7 @@ def run_all_tests():
|
||||
except Exception as e:
|
||||
print(f"\n✗ Test failed: {e}")
|
||||
import traceback
|
||||
|
||||
traceback.print_exc()
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
@@ -24,6 +24,7 @@ register_cpu_ci(est_time=300, suite="default")
|
||||
try:
|
||||
import torch
|
||||
import kt_kernel # Import kt_kernel first to register kt_kernel_ext
|
||||
|
||||
kt_kernel_ext = kt_kernel.kt_kernel_ext # Access the extension module
|
||||
from tqdm import tqdm
|
||||
|
||||
|
||||
@@ -25,8 +25,10 @@ register_cpu_ci(est_time=300, suite="default")
|
||||
try:
|
||||
import torch
|
||||
import kt_kernel # Import kt_kernel first to register kt_kernel_ext
|
||||
|
||||
kt_kernel_ext = kt_kernel.kt_kernel_ext # Access the extension module
|
||||
from tqdm import tqdm
|
||||
|
||||
HAS_DEPS = True
|
||||
except ImportError as e:
|
||||
HAS_DEPS = False
|
||||
@@ -156,11 +158,7 @@ def test_moe_amx_int4_1k_benchmark():
|
||||
CPUInfer = kt_kernel_ext.CPUInfer(worker_config)
|
||||
|
||||
# Physical to logical map for weight loading
|
||||
physical_to_logical_map = torch.tensor(
|
||||
data=range(expert_num),
|
||||
device="cpu",
|
||||
dtype=torch.int64
|
||||
).contiguous()
|
||||
physical_to_logical_map = torch.tensor(data=range(expert_num), device="cpu", dtype=torch.int64).contiguous()
|
||||
|
||||
# Initialize MOE layers
|
||||
moes = []
|
||||
@@ -322,6 +320,7 @@ def run_all_tests():
|
||||
except Exception as e:
|
||||
print(f"\nTest failed: {e}")
|
||||
import traceback
|
||||
|
||||
traceback.print_exc()
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
@@ -24,8 +24,10 @@ register_cpu_ci(est_time=300, suite="default")
|
||||
try:
|
||||
import torch
|
||||
import kt_kernel # Import kt_kernel first to register kt_kernel_ext
|
||||
|
||||
kt_kernel_ext = kt_kernel.kt_kernel_ext # Access the extension module
|
||||
from tqdm import tqdm
|
||||
|
||||
HAS_DEPS = True
|
||||
except ImportError as e:
|
||||
HAS_DEPS = False
|
||||
@@ -51,7 +53,6 @@ worker_config_dict = {
|
||||
CPUINFER_PARAM = 60
|
||||
|
||||
|
||||
|
||||
def get_git_commit():
|
||||
"""Get current git commit information."""
|
||||
result = {}
|
||||
@@ -307,6 +308,7 @@ def run_all_tests():
|
||||
except Exception as e:
|
||||
print(f"\n✗ Test failed: {e}")
|
||||
import traceback
|
||||
|
||||
traceback.print_exc()
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user