[FMHA] Support page_size=1 (linear layout) in batch prefill pipeline (#3545)

- Enable page_size=1 support in batch prefill codegen (linear layout only).
- Implement per-token page lookup in `kv_offset_array_transform` for page_size=1 to handle 3D input tensors correctly.
- Relax the `kPageBlockSize` alignment assertion for the page_size=1 case.
2026-04-19 22:39:03 +00:00 · 2026-01-13 12:04:43 +08:00
parent a575acb245
commit c9f112b026
2 changed files with 62 additions and 38 deletions
--- a/example/ck_tile/01_fmha/codegen/ops/fmha_batch_prefill.py
+++ b/example/ck_tile/01_fmha/codegen/ops/fmha_batch_prefill.py
@@ -36,7 +36,7 @@ DTYPE_BITS = {

 K0_MAX_SUBMAX_MAP = {32: 32, 64: 64, 96: 128, 128: 128, 256: 256}

-SUPPORTED_PAGE_SIZE = [128, 256, 1024]
+SUPPORTED_PAGE_SIZE = [1, 128, 256, 1024]
 SUPPORTED_KV_MEMORY_LAYOUT = ["vectorized", "linear"]
 SUPPORTED_KV_LOOKUP_TABLE = ["vllm", "sglang"]
 KV_MEMORY_LAYOUT_ENUM_MAP = {
@@ -737,6 +737,8 @@ def get_fwd_blobs(

                # Generate kernels for both page_size=16 and page_size=1024
                for page_size in SUPPORTED_PAGE_SIZE:
+                    if page_size == 1 and pipeline.F_kv_memory_layout != "linear":
+                        continue
                    k = FmhaFwdKernel(
                        F_idx=0,
                        F_hdim=hdim,