Kt minimax (#1742)

[feat]: fp8 kernel and kt-cli support
This commit is contained in:
ErvinXie
2025-12-24 15:39:44 +08:00
committed by GitHub
parent e7d277d163
commit d8046e1bb4
65 changed files with 12111 additions and 2502 deletions

View File

@@ -17,6 +17,7 @@ register_cpu_ci(est_time=30, suite="default")
# Check if kt_kernel_ext is available
try:
import kt_kernel # Import kt_kernel first to register kt_kernel_ext
kt_kernel_ext = kt_kernel.kt_kernel_ext # Access the extension module
HAS_KT_KERNEL = True
except ImportError:
@@ -51,7 +52,7 @@ def test_basic_module_attributes():
pytest.skip("kt_kernel_ext not built or available")
# Check for key attributes/functions
assert hasattr(kt_kernel_ext, 'CPUInfer'), "kt_kernel_ext should have CPUInfer class"
assert hasattr(kt_kernel_ext, "CPUInfer"), "kt_kernel_ext should have CPUInfer class"
def run_all_tests():

View File

@@ -20,6 +20,7 @@ register_cpu_ci(est_time=120, suite="default")
try:
import torch
import kt_kernel # Import kt_kernel first to register kt_kernel_ext
kt_kernel_ext = kt_kernel.kt_kernel_ext # Access the extension module
HAS_DEPS = True
except ImportError as e:
@@ -68,9 +69,7 @@ def moe_torch(input, expert_ids, weights, gate_proj, up_proj, down_proj):
if num_tokens == 0:
continue
tokens_for_this_expert = sorted_tokens[start_idx:end_idx]
expert_out = mlp_torch(
tokens_for_this_expert, gate_proj[i], up_proj[i], down_proj[i]
)
expert_out = mlp_torch(tokens_for_this_expert, gate_proj[i], up_proj[i], down_proj[i])
outputs.append(expert_out)
start_idx = end_idx
@@ -96,9 +95,7 @@ def test_moe_amx_int4_accuracy():
pytest.skip(f"Dependencies not available: {import_error}")
global physical_to_logical_map
physical_to_logical_map = torch.tensor(
data=range(expert_num), device="cpu", dtype=torch.int64
).contiguous()
physical_to_logical_map = torch.tensor(data=range(expert_num), device="cpu", dtype=torch.int64).contiguous()
CPUInfer = kt_kernel_ext.CPUInfer(60)
@@ -133,9 +130,7 @@ def test_moe_amx_int4_accuracy():
)
# Create MOE config
config = kt_kernel_ext.moe.MOEConfig(
expert_num, num_experts_per_tok, hidden_size, intermediate_size, 0
)
config = kt_kernel_ext.moe.MOEConfig(expert_num, num_experts_per_tok, hidden_size, intermediate_size, 0)
config.max_len = max_len
config.gate_proj = gate_proj.data_ptr()
config.up_proj = up_proj.data_ptr()
@@ -176,14 +171,10 @@ def test_moe_amx_int4_accuracy():
CPUInfer.sync()
# Run torch reference
t_output = moe_torch(
input_data, expert_ids, weights, gate_proj, up_proj, down_proj
)
t_output = moe_torch(input_data, expert_ids, weights, gate_proj, up_proj, down_proj)
# Calculate relative difference
diff = torch.mean(torch.abs(output - t_output)) / torch.mean(
torch.abs(t_output)
)
diff = torch.mean(torch.abs(output - t_output)) / torch.mean(torch.abs(t_output))
print(f"Iteration {i}, diff = {diff:.6f}")
# INT4 should have diff < 0.35
@@ -205,6 +196,7 @@ def run_all_tests():
except Exception as e:
print(f"\n✗ Test failed: {e}")
import traceback
traceback.print_exc()
sys.exit(1)

View File

@@ -20,6 +20,7 @@ register_cpu_ci(est_time=120, suite="default")
try:
import torch
import kt_kernel # Import kt_kernel first to register kt_kernel_ext
kt_kernel_ext = kt_kernel.kt_kernel_ext # Access the extension module
HAS_DEPS = True
except ImportError as e:
@@ -68,9 +69,7 @@ def moe_torch(input, expert_ids, weights, gate_proj, up_proj, down_proj):
if num_tokens == 0:
continue
tokens_for_this_expert = sorted_tokens[start_idx:end_idx]
expert_out = mlp_torch(
tokens_for_this_expert, gate_proj[i], up_proj[i], down_proj[i]
)
expert_out = mlp_torch(tokens_for_this_expert, gate_proj[i], up_proj[i], down_proj[i])
outputs.append(expert_out)
start_idx = end_idx
@@ -96,9 +95,7 @@ def test_moe_amx_int4_1_accuracy():
pytest.skip(f"Dependencies not available: {import_error}")
global physical_to_logical_map
physical_to_logical_map = torch.tensor(
data=range(expert_num), device="cpu", dtype=torch.int64
).contiguous()
physical_to_logical_map = torch.tensor(data=range(expert_num), device="cpu", dtype=torch.int64).contiguous()
CPUInfer = kt_kernel_ext.CPUInfer(60)
@@ -133,9 +130,7 @@ def test_moe_amx_int4_1_accuracy():
)
# Create MOE config
config = kt_kernel_ext.moe.MOEConfig(
expert_num, num_experts_per_tok, hidden_size, intermediate_size, 0
)
config = kt_kernel_ext.moe.MOEConfig(expert_num, num_experts_per_tok, hidden_size, intermediate_size, 0)
config.max_len = max_len
config.gate_proj = gate_proj.data_ptr()
config.up_proj = up_proj.data_ptr()
@@ -176,14 +171,10 @@ def test_moe_amx_int4_1_accuracy():
CPUInfer.sync()
# Run torch reference
t_output = moe_torch(
input_data, expert_ids, weights, gate_proj, up_proj, down_proj
)
t_output = moe_torch(input_data, expert_ids, weights, gate_proj, up_proj, down_proj)
# Calculate relative difference
diff = torch.mean(torch.abs(output - t_output)) / torch.mean(
torch.abs(t_output)
)
diff = torch.mean(torch.abs(output - t_output)) / torch.mean(torch.abs(t_output))
print(f"Iteration {i}, diff = {diff:.6f}")
# INT4_1 should have diff < 0.35
@@ -205,6 +196,7 @@ def run_all_tests():
except Exception as e:
print(f"\n✗ Test failed: {e}")
import traceback
traceback.print_exc()
sys.exit(1)

View File

@@ -20,6 +20,7 @@ register_cpu_ci(est_time=120, suite="default")
try:
import torch
import kt_kernel # Import kt_kernel first to register kt_kernel_ext
kt_kernel_ext = kt_kernel.kt_kernel_ext # Access the extension module
HAS_DEPS = True
except ImportError as e:
@@ -69,9 +70,7 @@ def moe_torch(input, expert_ids, weights, gate_proj, up_proj, down_proj):
if num_tokens == 0:
continue
tokens_for_this_expert = sorted_tokens[start_idx:end_idx]
expert_out = mlp_torch(
tokens_for_this_expert, gate_proj[i], up_proj[i], down_proj[i]
)
expert_out = mlp_torch(tokens_for_this_expert, gate_proj[i], up_proj[i], down_proj[i])
outputs.append(expert_out)
start_idx = end_idx
@@ -97,9 +96,7 @@ def test_moe_amx_int4_1k_accuracy():
pytest.skip(f"Dependencies not available: {import_error}")
global physical_to_logical_map
physical_to_logical_map = torch.tensor(
data=range(expert_num), device="cpu", dtype=torch.int64
).contiguous()
physical_to_logical_map = torch.tensor(data=range(expert_num), device="cpu", dtype=torch.int64).contiguous()
CPUInfer = kt_kernel_ext.CPUInfer(60)
@@ -134,9 +131,7 @@ def test_moe_amx_int4_1k_accuracy():
)
# Create MOE config
config = kt_kernel_ext.moe.MOEConfig(
expert_num, num_experts_per_tok, hidden_size, intermediate_size, 0
)
config = kt_kernel_ext.moe.MOEConfig(expert_num, num_experts_per_tok, hidden_size, intermediate_size, 0)
config.max_len = max_len
config.gate_proj = gate_proj.data_ptr()
config.up_proj = up_proj.data_ptr()
@@ -180,14 +175,10 @@ def test_moe_amx_int4_1k_accuracy():
CPUInfer.sync()
# Run torch reference
t_output = moe_torch(
input_data, expert_ids, weights, gate_proj, up_proj, down_proj
)
t_output = moe_torch(input_data, expert_ids, weights, gate_proj, up_proj, down_proj)
# Calculate relative difference
diff = torch.mean(torch.abs(output - t_output)) / torch.mean(
torch.abs(t_output)
)
diff = torch.mean(torch.abs(output - t_output)) / torch.mean(torch.abs(t_output))
print(f"Iteration {i}, diff = {diff:.6f}")
# INT4_1K should have diff < 0.35
@@ -209,6 +200,7 @@ def run_all_tests():
except Exception as e:
print(f"\n✗ Test failed: {e}")
import traceback
traceback.print_exc()
sys.exit(1)

View File

@@ -20,6 +20,7 @@ register_cpu_ci(est_time=120, suite="default")
try:
import torch
import kt_kernel # Import kt_kernel first to register kt_kernel_ext
kt_kernel_ext = kt_kernel.kt_kernel_ext # Access the extension module
HAS_DEPS = True
except ImportError as e:
@@ -68,9 +69,7 @@ def moe_torch(input, expert_ids, weights, gate_proj, up_proj, down_proj):
if num_tokens == 0:
continue
tokens_for_this_expert = sorted_tokens[start_idx:end_idx]
expert_out = mlp_torch(
tokens_for_this_expert, gate_proj[i], up_proj[i], down_proj[i]
)
expert_out = mlp_torch(tokens_for_this_expert, gate_proj[i], up_proj[i], down_proj[i])
outputs.append(expert_out)
start_idx = end_idx
@@ -96,9 +95,7 @@ def test_moe_amx_int8_accuracy():
pytest.skip(f"Dependencies not available: {import_error}")
global physical_to_logical_map
physical_to_logical_map = torch.tensor(
data=range(expert_num), device="cpu", dtype=torch.int64
).contiguous()
physical_to_logical_map = torch.tensor(data=range(expert_num), device="cpu", dtype=torch.int64).contiguous()
CPUInfer = kt_kernel_ext.CPUInfer(60)
@@ -133,9 +130,7 @@ def test_moe_amx_int8_accuracy():
)
# Create MOE config
config = kt_kernel_ext.moe.MOEConfig(
expert_num, num_experts_per_tok, hidden_size, intermediate_size, 0
)
config = kt_kernel_ext.moe.MOEConfig(expert_num, num_experts_per_tok, hidden_size, intermediate_size, 0)
config.max_len = max_len
config.gate_proj = gate_proj.data_ptr()
config.up_proj = up_proj.data_ptr()
@@ -174,14 +169,10 @@ def test_moe_amx_int8_accuracy():
CPUInfer.sync()
# Run torch reference
t_output = moe_torch(
input_data, expert_ids, weights, gate_proj, up_proj, down_proj
)
t_output = moe_torch(input_data, expert_ids, weights, gate_proj, up_proj, down_proj)
# Calculate relative difference
diff = torch.mean(torch.abs(output - t_output)) / torch.mean(
torch.abs(t_output)
)
diff = torch.mean(torch.abs(output - t_output)) / torch.mean(torch.abs(t_output))
print(f"Iteration {i}, diff = {diff:.6f}")
# INT8 should have diff < 0.05
@@ -203,6 +194,7 @@ def run_all_tests():
except Exception as e:
print(f"\n✗ Test failed: {e}")
import traceback
traceback.print_exc()
sys.exit(1)

View File

@@ -24,8 +24,10 @@ register_cpu_ci(est_time=300, suite="default")
try:
import torch
import kt_kernel # Import kt_kernel first to register kt_kernel_ext
kt_kernel_ext = kt_kernel.kt_kernel_ext # Access the extension module
from tqdm import tqdm
HAS_DEPS = True
except ImportError as e:
HAS_DEPS = False
@@ -306,6 +308,7 @@ def run_all_tests():
except Exception as e:
print(f"\n✗ Test failed: {e}")
import traceback
traceback.print_exc()
sys.exit(1)

View File

@@ -24,6 +24,7 @@ register_cpu_ci(est_time=300, suite="default")
try:
import torch
import kt_kernel # Import kt_kernel first to register kt_kernel_ext
kt_kernel_ext = kt_kernel.kt_kernel_ext # Access the extension module
from tqdm import tqdm

View File

@@ -25,8 +25,10 @@ register_cpu_ci(est_time=300, suite="default")
try:
import torch
import kt_kernel # Import kt_kernel first to register kt_kernel_ext
kt_kernel_ext = kt_kernel.kt_kernel_ext # Access the extension module
from tqdm import tqdm
HAS_DEPS = True
except ImportError as e:
HAS_DEPS = False
@@ -156,11 +158,7 @@ def test_moe_amx_int4_1k_benchmark():
CPUInfer = kt_kernel_ext.CPUInfer(worker_config)
# Physical to logical map for weight loading
physical_to_logical_map = torch.tensor(
data=range(expert_num),
device="cpu",
dtype=torch.int64
).contiguous()
physical_to_logical_map = torch.tensor(data=range(expert_num), device="cpu", dtype=torch.int64).contiguous()
# Initialize MOE layers
moes = []
@@ -322,6 +320,7 @@ def run_all_tests():
except Exception as e:
print(f"\nTest failed: {e}")
import traceback
traceback.print_exc()
sys.exit(1)

View File

@@ -24,8 +24,10 @@ register_cpu_ci(est_time=300, suite="default")
try:
import torch
import kt_kernel # Import kt_kernel first to register kt_kernel_ext
kt_kernel_ext = kt_kernel.kt_kernel_ext # Access the extension module
from tqdm import tqdm
HAS_DEPS = True
except ImportError as e:
HAS_DEPS = False
@@ -51,7 +53,6 @@ worker_config_dict = {
CPUINFER_PARAM = 60
def get_git_commit():
"""Get current git commit information."""
result = {}
@@ -307,6 +308,7 @@ def run_all_tests():
except Exception as e:
print(f"\n✗ Test failed: {e}")
import traceback
traceback.print_exc()
sys.exit(1)