mirror of
https://github.com/kvcache-ai/ktransformers.git
synced 2026-04-20 14:29:22 +00:00
* fix(amx): add BufferASmallKGroupImpl to fix buffer overflow in from_mat.
  The original BufferAKGroupImpl::from_mat writes 64 bytes per K_STEP iteration,
  but when K_STEP=32 (for GemmKernel224Int4SmallKGroup) this causes a buffer
  overflow. BufferASmallKGroupImpl overrides from_mat to write only 32 bytes
  per iteration.
* perf(k2-moe): optimize memory allocation with pooled buffers
  - Replace per-expert buffer allocation with shared memory pools
  - Dynamically assign buffer slices based on activated experts
  - Add group_size inference from scale tensor shape in amx.py
* delete kimi k2 forward test
* add TODO comment for pool_count_ calculation
This commit is contained in:
```diff
@@ -404,8 +404,16 @@ class RAWAMXMoEWrapper(BaseMoEWrapper):
         moe_config.pool = self.cpu_infer.backend_
         moe_config.max_len = self.chunked_prefill_size
+
+        # Infer group_size from scale shape (column-major layout)
+        # For gate/up projection: in_features = hidden_size
+        # So: group_size = hidden_size / scale.shape[1]
+        scale_shape = self.gate_scales[0].shape
+        group_size = self.hidden_size // scale_shape[1]
+        print(f"[RAWAMXMoEWrapper Layer {self.layer_idx}] Inferred group_size: {group_size}")
+
         moe_config.quant_config.bits = 4
-        moe_config.quant_config.group_size = 32
+        moe_config.quant_config.group_size = group_size
 
         moe_config.quant_config.zero_point = False
 
         # Use gate_projs instead of gate_proj for per-expert pointers
```
Reference in New Issue
Block a user