# Mirror of https://github.com/kvcache-ai/ktransformers.git
# (synced 2026-03-14 18:37:23 +00:00)
#!/usr/bin/env python
# coding=utf-8
"""
Benchmark write_weight_scale_to_buffer for AMX MOE operators.

Supports:
- FP8: FP8 weights (1 byte) + float32 scales (block-wise)
- FP8_PERCHANNEL: FP8 weights (1 byte) + float32 per-channel scales
- BF16: Native BF16 weights (2 bytes), no scales

Usage:
    python bench_write_buffer.py                  # Run all modes
    python bench_write_buffer.py fp8              # Run FP8 only
    python bench_write_buffer.py fp8_perchannel   # Run FP8 per-channel only
    python bench_write_buffer.py bf16             # Run BF16 only
"""
import json
import os
import platform
import subprocess
import sys
import time

from tqdm import tqdm

# Make the compiled extension (built into ../build) importable before
# `from kt_kernel import kt_kernel_ext` below.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "build"))

from kt_kernel import kt_kernel_ext
import torch
|
# Benchmark parameters
expert_num = 256  # total number of routed experts per MOE layer
num_experts_per_tok = 8  # top-k experts activated per token
gpu_tp_count = 2  # tensor-parallel rank count the output buffers are split across

warm_up_iter = 30  # untimed warm-up iterations
test_iter = 70  # timed iterations

# Copy out every expert each iteration.
gpu_experts_num = expert_num

hidden_size = 7168
intermediate_size = 2048
group_size = 128  # FP8 uses 128x128 block-wise scales
max_len = 1  # minimal token capacity; this benchmark only moves weights

# Identity mapping: physical expert i holds logical expert i.
physical_to_logical_map = torch.arange(expert_num, dtype=torch.int64, device="cpu").contiguous()
CPUInfer = kt_kernel_ext.CPUInfer(80)  # CPU inference thread pool (80 threads)
|
|
def get_git_commit():
    """Return a dict describing the current git state.

    Keys on success: "commit", "commit_message", "dirty", and
    "dirty_files" when the working tree has changes. On any failure
    (e.g. not a git repo) the dict contains only "error".
    """
    info = {}
    try:
        def _git(*args):
            return subprocess.check_output(["git", *args]).decode("utf-8").strip()

        info["commit"] = _git("rev-parse", "HEAD")
        info["commit_message"] = _git("log", "-1", "--pretty=%B")
        dirty = _git("status", "--porcelain")
        info["dirty"] = bool(dirty)
        if dirty:
            info["dirty_files"] = dirty.splitlines()
    except Exception as e:
        info["error"] = str(e)
    return info
|
|
|
def get_system_info():
    """Return host info: OS, node name, CPU cores, and on Linux the CPU
    model and total memory in GB (read from /proc)."""
    uname = platform.uname()
    info = {
        "system_name": uname.system,
        "node_name": uname.node,
        "cpu_core_count": os.cpu_count(),
    }

    def _first_value(path, marker):
        # Value after ':' on the first line of `path` containing `marker`,
        # or None when the file is absent or the marker never appears.
        if not os.path.exists(path):
            return None
        with open(path, "r") as f:
            for line in f:
                if marker in line:
                    return line.split(":", 1)[1]
        return None

    model = _first_value("/proc/cpuinfo", "model name")
    if model is not None:
        info["cpu_model"] = model.strip()

    mem = _first_value("/proc/meminfo", "MemTotal")
    if mem is not None:
        mem_kb = float(mem.split()[0])
        info["memory_size_GB"] = round(mem_kb / (1024 * 1024), 2)

    return info
|
|
|
# Results are appended as JSON Lines to "<script_name>.jsonl" next to this script.
script_path = os.path.abspath(__file__)
script_dir = os.path.dirname(script_path)
script_name = os.path.splitext(os.path.basename(script_path))[0]
json_path = os.path.join(script_dir, script_name + ".jsonl")
|
|
|
def record_results(result, filename=json_path):
    """Append one benchmark result as a single JSON line to `filename`."""
    line = json.dumps(result)
    with open(filename, "a") as out:
        out.write(line + "\n")
|
|
|
def div_up(a, b):
    """Ceiling integer division: smallest n with n * b >= a (for b > 0)."""
    rounded_numerator = a + b - 1
    return rounded_numerator // b
|
|
|
# ==============================================================================
# FP8 Functions
# ==============================================================================
|
|
|
|
def allocate_weights_fp8():
    """Allocate random FP8 expert weights (raw uint8 bytes) plus block-wise
    float32 scales for all experts.

    Tensors are generated on the GPU and then copied to contiguous CPU
    memory, which is where the extension reads them from.
    """
    per_mat_weight_bytes = hidden_size * intermediate_size
    # Block-wise scales: one float per group_size x group_size tile.
    n_blocks_n_gate_up = div_up(intermediate_size, group_size)
    n_blocks_k = div_up(hidden_size, group_size)
    per_mat_scale_elems_gate_up = n_blocks_n_gate_up * n_blocks_k
    per_mat_scale_elems_down = n_blocks_k * n_blocks_n_gate_up

    def _rand_u8(numel):
        # Random raw bytes standing in for FP8-encoded weights.
        t = torch.randint(0, 256, (numel,), dtype=torch.uint8, device="cuda")
        return t.to("cpu").contiguous()

    def _rand_f32(numel):
        return torch.randn(numel, dtype=torch.float32, device="cuda").to("cpu").contiguous()

    weight_numel = expert_num * per_mat_weight_bytes
    return {
        "gate_q": _rand_u8(weight_numel),
        "up_q": _rand_u8(weight_numel),
        "down_q": _rand_u8(weight_numel),
        "gate_scale": _rand_f32(expert_num * per_mat_scale_elems_gate_up),
        "up_scale": _rand_f32(expert_num * per_mat_scale_elems_gate_up),
        "down_scale": _rand_f32(expert_num * per_mat_scale_elems_down),
        "per_mat_weight_bytes": per_mat_weight_bytes,
        "per_mat_scale_elems_gate_up": per_mat_scale_elems_gate_up,
        "per_mat_scale_elems_down": per_mat_scale_elems_down,
    }
|
|
|
|
def allocate_weights_fp8_perchannel():
    """Allocate random FP8 expert weights (raw uint8 bytes) plus per-channel
    float32 scales for all experts.

    Per-channel scaling: one scale per output channel — intermediate_size
    scales for gate/up, hidden_size scales for down.
    """
    per_mat_weight_bytes = hidden_size * intermediate_size
    per_mat_scale_elems_gate_up = intermediate_size
    per_mat_scale_elems_down = hidden_size

    def _rand_u8(numel):
        # Random raw bytes standing in for FP8-encoded weights.
        t = torch.randint(0, 256, (numel,), dtype=torch.uint8, device="cuda")
        return t.to("cpu").contiguous()

    def _rand_f32(numel):
        return torch.randn(numel, dtype=torch.float32, device="cuda").to("cpu").contiguous()

    weight_numel = expert_num * per_mat_weight_bytes
    return {
        "gate_q": _rand_u8(weight_numel),
        "up_q": _rand_u8(weight_numel),
        "down_q": _rand_u8(weight_numel),
        "gate_scale": _rand_f32(expert_num * per_mat_scale_elems_gate_up),
        "up_scale": _rand_f32(expert_num * per_mat_scale_elems_gate_up),
        "down_scale": _rand_f32(expert_num * per_mat_scale_elems_down),
        "per_mat_weight_bytes": per_mat_weight_bytes,
        "per_mat_scale_elems_gate_up": per_mat_scale_elems_gate_up,
        "per_mat_scale_elems_down": per_mat_scale_elems_down,
    }
|
|
|
|
def build_moe_fp8(layer_idx=0):
    """Build a single FP8 MOE instance and load its weights.

    Returns (moe, buffer_shapes, weights). The caller must keep `weights`
    alive: the config stores raw data_ptr() addresses into those tensors.
    """
    weights = allocate_weights_fp8()

    config = kt_kernel_ext.moe.MOEConfig(expert_num, num_experts_per_tok, hidden_size, intermediate_size)
    config.max_len = max_len
    config.layer_idx = layer_idx
    # Block-wise FP8 quantization: 8-bit, group_size x group_size tiles, no zero point.
    config.quant_config.bits = 8
    config.quant_config.group_size = group_size
    config.quant_config.zero_point = False
    config.pool = CPUInfer.backend_
    # Raw pointers into the CPU weight/scale tensors allocated above.
    config.gate_proj = weights["gate_q"].data_ptr()
    config.up_proj = weights["up_q"].data_ptr()
    config.down_proj = weights["down_q"].data_ptr()
    config.gate_scale = weights["gate_scale"].data_ptr()
    config.up_scale = weights["up_scale"].data_ptr()
    config.down_scale = weights["down_scale"].data_ptr()

    moe = kt_kernel_ext.moe.AMXFP8_MOE(config)
    # Load weights on the CPU thread pool (presumably repacking into the AMX
    # layout — confirm against the extension).
    CPUInfer.submit(moe.load_weights_task(physical_to_logical_map.data_ptr()))
    CPUInfer.sync()

    # Shapes needed later to size the per-expert output buffers.
    buffer_shapes = {
        "per_mat_weight_bytes": weights["per_mat_weight_bytes"],
        "per_mat_scale_elems_gate_up": weights["per_mat_scale_elems_gate_up"],
        "per_mat_scale_elems_down": weights["per_mat_scale_elems_down"],
    }

    return moe, buffer_shapes, weights
|
|
|
|
def build_moe_fp8_perchannel(layer_idx=0):
    """Build a single FP8 per-channel MOE instance and load its weights.

    Returns (moe, buffer_shapes, weights). The caller must keep `weights`
    alive: the config stores raw data_ptr() addresses into those tensors.
    """
    weights = allocate_weights_fp8_perchannel()

    config = kt_kernel_ext.moe.MOEConfig(expert_num, num_experts_per_tok, hidden_size, intermediate_size)
    config.max_len = max_len
    config.layer_idx = layer_idx
    # Per-channel FP8 quantization: 8-bit, group_size 0 (unused), per_channel on.
    config.quant_config.bits = 8
    config.quant_config.group_size = 0
    config.quant_config.zero_point = False
    config.quant_config.per_channel = True
    config.pool = CPUInfer.backend_
    # Raw pointers into the CPU weight/scale tensors allocated above.
    config.gate_proj = weights["gate_q"].data_ptr()
    config.up_proj = weights["up_q"].data_ptr()
    config.down_proj = weights["down_q"].data_ptr()
    config.gate_scale = weights["gate_scale"].data_ptr()
    config.up_scale = weights["up_scale"].data_ptr()
    config.down_scale = weights["down_scale"].data_ptr()

    moe = kt_kernel_ext.moe.AMXFP8PerChannel_MOE(config)
    # Load weights on the CPU thread pool.
    CPUInfer.submit(moe.load_weights_task(physical_to_logical_map.data_ptr()))
    CPUInfer.sync()

    # Shapes needed later to size the per-expert output buffers.
    buffer_shapes = {
        "per_mat_weight_bytes": weights["per_mat_weight_bytes"],
        "per_mat_scale_elems_gate_up": weights["per_mat_scale_elems_gate_up"],
        "per_mat_scale_elems_down": weights["per_mat_scale_elems_down"],
    }

    return moe, buffer_shapes, weights
|
|
|
|
def allocate_buffers_fp8(buffer_shapes):
    """Allocate per-TP-rank output buffers for a single FP8 expert.

    w13 buffers hold gate+up concatenated (hence the factor of 2); w2 holds
    down. Returns (pointer dict keyed as the extension expects, tensor dict
    that must be kept alive while the pointers are in use).
    """
    tp = gpu_tp_count
    weight_bytes_per_tp = buffer_shapes["per_mat_weight_bytes"] // tp
    scale_elems_per_tp_13 = buffer_shapes["per_mat_scale_elems_gate_up"] // tp
    scale_elems_per_tp_2 = buffer_shapes["per_mat_scale_elems_down"] // tp

    def _per_rank(numel, dtype):
        return [torch.empty(numel, dtype=dtype) for _ in range(tp)]

    keep_tensors = {
        "w13_weight_bufs": _per_rank(2 * weight_bytes_per_tp, torch.uint8),
        "w13_scale_bufs": _per_rank(2 * scale_elems_per_tp_13, torch.float32),
        "w2_weight_bufs": _per_rank(weight_bytes_per_tp, torch.uint8),
        "w2_scale_bufs": _per_rank(scale_elems_per_tp_2, torch.float32),
    }

    buffer_ptrs = {
        "w13_weight_ptrs": [t.data_ptr() for t in keep_tensors["w13_weight_bufs"]],
        "w13_scale_ptrs": [t.data_ptr() for t in keep_tensors["w13_scale_bufs"]],
        "w2_weight_ptrs": [t.data_ptr() for t in keep_tensors["w2_weight_bufs"]],
        "w2_scale_ptrs": [t.data_ptr() for t in keep_tensors["w2_scale_bufs"]],
    }

    return buffer_ptrs, keep_tensors
|
|
|
|
def allocate_buffers_fp8_perchannel(buffer_shapes):
    """Allocate per-TP-rank output buffers for a single FP8 per-channel expert.

    w13 buffers hold gate+up concatenated (hence the factor of 2); w2 holds
    down. Returns (pointer dict keyed as the extension expects, tensor dict
    that must be kept alive while the pointers are in use).
    """
    tp = gpu_tp_count
    weight_bytes_per_tp = buffer_shapes["per_mat_weight_bytes"] // tp
    scale_elems_per_tp_13 = buffer_shapes["per_mat_scale_elems_gate_up"] // tp
    # NOTE(review): down-proj scales are intentionally NOT divided by tp here
    # (unlike the block-wise FP8 path) — presumably every rank needs the full
    # set of per-hidden-channel scales; confirm against the extension.
    scale_elems_per_tp_2 = buffer_shapes["per_mat_scale_elems_down"]

    def _per_rank(numel, dtype):
        return [torch.empty(numel, dtype=dtype) for _ in range(tp)]

    keep_tensors = {
        "w13_weight_bufs": _per_rank(2 * weight_bytes_per_tp, torch.uint8),
        "w13_scale_bufs": _per_rank(2 * scale_elems_per_tp_13, torch.float32),
        "w2_weight_bufs": _per_rank(weight_bytes_per_tp, torch.uint8),
        "w2_scale_bufs": _per_rank(scale_elems_per_tp_2, torch.float32),
    }

    buffer_ptrs = {
        "w13_weight_ptrs": [t.data_ptr() for t in keep_tensors["w13_weight_bufs"]],
        "w13_scale_ptrs": [t.data_ptr() for t in keep_tensors["w13_scale_bufs"]],
        "w2_weight_ptrs": [t.data_ptr() for t in keep_tensors["w2_weight_bufs"]],
        "w2_scale_ptrs": [t.data_ptr() for t in keep_tensors["w2_scale_bufs"]],
    }

    return buffer_ptrs, keep_tensors
|
|
|
|
# ==============================================================================
# BF16 Functions
# ==============================================================================
|
|
|
|
def allocate_weights_bf16():
    """Allocate random native BF16 expert weights for all experts (no scales).

    Tensors are generated on the GPU and then copied to contiguous CPU memory.
    """
    per_mat_weight_elems = hidden_size * intermediate_size
    per_mat_weight_bytes = per_mat_weight_elems * 2  # BF16 = 2 bytes

    def _rand_bf16(numel):
        return torch.randn(numel, dtype=torch.bfloat16, device="cuda").to("cpu").contiguous()

    total_elems = expert_num * per_mat_weight_elems
    return {
        "gate_proj": _rand_bf16(total_elems),
        "up_proj": _rand_bf16(total_elems),
        "down_proj": _rand_bf16(total_elems),
        "per_mat_weight_bytes": per_mat_weight_bytes,
        "per_mat_weight_elems": per_mat_weight_elems,
    }
|
|
|
|
def build_moe_bf16(layer_idx=0):
    """Build a single BF16 MOE instance and load its weights.

    Returns (moe, buffer_shapes, weights). The caller must keep `weights`
    alive: the config stores raw data_ptr() addresses into those tensors.
    """
    weights = allocate_weights_bf16()

    config = kt_kernel_ext.moe.MOEConfig(expert_num, num_experts_per_tok, hidden_size, intermediate_size)
    config.max_len = max_len
    config.layer_idx = layer_idx
    config.pool = CPUInfer.backend_
    # Raw pointers into the CPU weight tensors allocated above.
    config.gate_proj = weights["gate_proj"].data_ptr()
    config.up_proj = weights["up_proj"].data_ptr()
    config.down_proj = weights["down_proj"].data_ptr()
    # BF16 is unquantized: null scale pointers.
    config.gate_scale = 0
    config.up_scale = 0
    config.down_scale = 0

    moe = kt_kernel_ext.moe.AMXBF16_MOE(config)
    # Load weights on the CPU thread pool.
    CPUInfer.submit(moe.load_weights_task(physical_to_logical_map.data_ptr()))
    CPUInfer.sync()

    # Shapes needed later to size the per-expert output buffers.
    buffer_shapes = {
        "per_mat_weight_bytes": weights["per_mat_weight_bytes"],
        "per_mat_weight_elems": weights["per_mat_weight_elems"],
    }

    return moe, buffer_shapes, weights
|
|
|
|
def allocate_buffers_bf16(buffer_shapes):
    """Allocate per-TP-rank output buffers for a single BF16 expert.

    BF16 has no scales, but 1-element dummy scale buffers are still created
    because the write_weight_scale_to_buffer interface expects scale pointers.
    Returns (pointer dict, tensor dict that must be kept alive).
    """
    tp = gpu_tp_count
    weight_bytes_per_tp = buffer_shapes["per_mat_weight_bytes"] // tp

    def _per_rank(numel, dtype):
        return [torch.empty(numel, dtype=dtype) for _ in range(tp)]

    keep_tensors = {
        "w13_weight_bufs": _per_rank(2 * weight_bytes_per_tp, torch.uint8),
        # Dummy scale buffers (not used for BF16 but needed for interface).
        "w13_scale_bufs": _per_rank(1, torch.float32),
        "w2_weight_bufs": _per_rank(weight_bytes_per_tp, torch.uint8),
        "w2_scale_bufs": _per_rank(1, torch.float32),
    }

    buffer_ptrs = {
        "w13_weight_ptrs": [t.data_ptr() for t in keep_tensors["w13_weight_bufs"]],
        "w13_scale_ptrs": [t.data_ptr() for t in keep_tensors["w13_scale_bufs"]],
        "w2_weight_ptrs": [t.data_ptr() for t in keep_tensors["w2_weight_bufs"]],
        "w2_scale_ptrs": [t.data_ptr() for t in keep_tensors["w2_scale_bufs"]],
    }

    return buffer_ptrs, keep_tensors
|
|
|
|
# ==============================================================================
# Benchmark Functions
# ==============================================================================
|
|
|
|
|
def bench_write_buffer(quant_mode: str):
    """Benchmark write_weight_scale_to_buffer for the specified quant mode.

    Builds two MOE layers (alternated every pass so consecutive copies do not
    reuse the exact same source memory), copies every expert's weights and
    scales into per-TP-rank buffers, and measures time and effective bandwidth.
    Appends a JSON result line via record_results().

    Args:
        quant_mode: one of "fp8", "fp8_perchannel", "bf16".

    Returns:
        Measured bandwidth in GB/s.

    Raises:
        ValueError: for an unsupported quant_mode.
    """
    print(f"\n{'='*60}")
    print(f"{quant_mode.upper()} write_weight_scale_to_buffer benchmark")
    print(f"{'='*60}")

    def _fp8_bytes_per_call(shapes):
        # Bytes moved per MOE: weight bytes (1 byte/elem, 3 matrices per
        # expert) plus float32 scale bytes (gate + up + down per expert).
        # Shared by the block-wise and per-channel FP8 paths, which only
        # differ in the scale element counts carried in `shapes`.
        weight_bytes = hidden_size * intermediate_size * expert_num * 3
        scale_bytes = (
            (shapes["per_mat_scale_elems_gate_up"] * 2 + shapes["per_mat_scale_elems_down"])
            * expert_num
            * 4
        )
        return weight_bytes + scale_bytes

    # keep_tensors_* / buffer_keep hold the CPU tensors alive for the whole
    # benchmark: the extension only sees raw pointers into them.
    if quant_mode == "fp8":
        moe_0, buffer_shapes, keep_tensors_0 = build_moe_fp8(layer_idx=0)
        moe_1, _, keep_tensors_1 = build_moe_fp8(layer_idx=1)
        buffer_ptrs, buffer_keep = allocate_buffers_fp8(buffer_shapes)
        bytes_per_call = _fp8_bytes_per_call(buffer_shapes)

    elif quant_mode == "fp8_perchannel":
        moe_0, buffer_shapes, keep_tensors_0 = build_moe_fp8_perchannel(layer_idx=0)
        moe_1, _, keep_tensors_1 = build_moe_fp8_perchannel(layer_idx=1)
        buffer_ptrs, buffer_keep = allocate_buffers_fp8_perchannel(buffer_shapes)
        bytes_per_call = _fp8_bytes_per_call(buffer_shapes)

    elif quant_mode == "bf16":
        moe_0, buffer_shapes, keep_tensors_0 = build_moe_bf16(layer_idx=0)
        moe_1, _, keep_tensors_1 = build_moe_bf16(layer_idx=1)
        buffer_ptrs, buffer_keep = allocate_buffers_bf16(buffer_shapes)
        # BF16: only weights, no scales (2 bytes per element).
        bytes_per_call = hidden_size * intermediate_size * expert_num * 3 * 2

    else:
        raise ValueError(f"Unsupported quant_mode: {quant_mode}")

    moes = [moe_0, moe_1]

    def _one_pass():
        # Copy out every expert of both MOEs once. Sync after each task:
        # all tasks write into the same shared output buffers.
        for moe in moes:
            for expert_id in range(gpu_experts_num):
                CPUInfer.submit(
                    moe.write_weight_scale_to_buffer_task(gpu_tp_count=gpu_tp_count, expert_id=expert_id, **buffer_ptrs)
                )
                CPUInfer.sync()

    # Warm-up (untimed)
    for _ in tqdm(range(warm_up_iter), desc=f"[{quant_mode.upper()}] Warm-up"):
        _one_pass()

    # Benchmark
    total_time = 0.0
    for _ in tqdm(range(test_iter), desc=f"[{quant_mode.upper()}] Testing"):
        start = time.perf_counter()
        _one_pass()
        total_time += time.perf_counter() - start
        # Let the machine settle between iterations (outside the timed span).
        time.sleep(0.3)

    # bytes_per_call is for one MOE; each iteration touches 2 MOEs.
    bytes_per_iter = bytes_per_call * 2
    time_per_iter_ms = total_time / test_iter * 1000
    bandwidth_gbs = bytes_per_iter * test_iter / total_time / 1e9

    print(f"\n{'='*60}")
    print(f"{quant_mode.upper()} write_weight_scale_to_buffer Results (2 MOEs alternating)")
    print(f"{'='*60}")
    print(f"Time per iteration: {time_per_iter_ms:.2f} ms")
    print(f"Bandwidth: {bandwidth_gbs:.2f} GB/s")
    print(f"Experts per MOE: {gpu_experts_num}, MOEs: 2")
    print(f"Time per expert: {time_per_iter_ms/(gpu_experts_num*2)*1000:.2f} us")

    result = {
        "op": f"write_weight_scale_to_buffer_{quant_mode}",
        "quant_mode": quant_mode,
        "time_per_iteration_ms": time_per_iter_ms,
        "bandwidth_GBs": bandwidth_gbs,
        "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
        "test_parameters": {
            "expert_num": expert_num,
            "hidden_size": hidden_size,
            "intermediate_size": intermediate_size,
            "gpu_tp_count": gpu_tp_count,
            "bytes_per_iter": bytes_per_iter,
            "num_moes": 2,
        },
    }
    if quant_mode == "fp8":
        # Only block-wise FP8 has a meaningful group size.
        result["test_parameters"]["group_size"] = group_size

    result.update(get_git_commit())
    result.update(get_system_info())
    record_results(result)

    return bandwidth_gbs
|
|
|
|
def main(quant_modes=None):
    """Run benchmarks for the given quant modes (all three when None) and
    print a pass/fail summary."""
    modes = ["fp8", "fp8_perchannel", "bf16"] if quant_modes is None else quant_modes

    results = {}
    for mode in modes:
        try:
            bandwidth = bench_write_buffer(mode)
        except Exception as e:
            results[mode] = f"FAILED: {e}"
            import traceback

            traceback.print_exc()
        else:
            results[mode] = f"PASSED ({bandwidth:.2f} GB/s)"

    print("\n" + "=" * 60)
    print("SUMMARY")
    print("=" * 60)
    for mode, outcome in results.items():
        print(f" {mode.upper()}: {outcome}")
|
|
|
|
if __name__ == "__main__":
    # Optional single CLI argument selects one mode; no argument runs all.
    if len(sys.argv) > 1:
        selected = sys.argv[1].lower()
        if selected not in ("fp8", "fp8_perchannel", "bf16"):
            print(f"Unknown mode: {selected}. Use 'fp8', 'fp8_perchannel' or 'bf16'")
            sys.exit(1)
        main([selected])
    else:
        main()
|