mirror of
https://github.com/kvcache-ai/ktransformers.git
synced 2026-04-20 14:29:22 +00:00
* fix(amx): add BufferASmallKGroupImpl to fix buffer overflow in from_mat.
  The original BufferAKGroupImpl::from_mat writes 64 bytes per K_STEP iteration,
  but when K_STEP=32 (for GemmKernel224Int4SmallKGroup) this causes a buffer
  overflow. BufferASmallKGroupImpl overrides from_mat to write only 32 bytes
  per iteration.
* perf(k2-moe): optimize memory allocation with pooled buffers
  - Replace per-expert buffer allocation with shared memory pools
  - Dynamically assign buffer slices based on activated experts
  - Add group_size inference from scale tensor shape in amx.py
* delete kimi k2 forward test
* add TODO comment for pool_count_ calculation
This commit is contained in:
```diff
@@ -404,8 +404,16 @@ class RAWAMXMoEWrapper(BaseMoEWrapper):
         moe_config.pool = self.cpu_infer.backend_
         moe_config.max_len = self.chunked_prefill_size
+
+        # Infer group_size from scale shape (column-major layout)
+        # For gate/up projection: in_features = hidden_size
+        # So: group_size = hidden_size / scale.shape[1]
+        scale_shape = self.gate_scales[0].shape
+        group_size = self.hidden_size // scale_shape[1]
+        print(f"[RAWAMXMoEWrapper Layer {self.layer_idx}] Inferred group_size: {group_size}")
+
         moe_config.quant_config.bits = 4
-        moe_config.quant_config.group_size = 32
+        moe_config.quant_config.group_size = group_size
 
         moe_config.quant_config.zero_point = False
 
         # Use gate_projs instead of gate_proj for per-expert pointers
```
Reference in New Issue
Block a user