Files
composable_kernel/example/ck_tile/01_fmha/script/benchmark_fwd.sh
Jeff Huang 518d24e662 Add sequence padding and variable length support in fmha (#2932)
* * [CK_TILE] Add sequence padding and variable length support in fmha (and v3)

 - Group Mode Padding: Introduces the `-s_qpad` argument to support
   physically padded layouts. Kernels now use padded start pointers
   (`seqstart_padded_*_ptr`) for memory addressing.

 - Batch Mode Variable Length: Adds `-q_eff_lens` and `-kv_eff_lens`
   arguments for efficient processing of variable-length sequences by
   passing cumulative effective lengths (`cu_seqlen_*_ptr`) to the kernel.

 - FMHA examples: Support padding and variable length both in
   group and batch mode. Dispatcher is updated as well (dispatch to
   kPadSeqLenK enabled pipeline).

 - New padding test cases: Add padding test cases to `smoke_test_fwd.sh` and
   `test_fmha_fwd.inc`, and add benchmarks to `benchmark_fwd.sh` and
   `benchmark_fwd_v3.sh` as well. These test cases and benchmarks that
   specifically validate/benchmark the new padding and variable-length
   functionalities in both group and batch modes.

* [CK_TILE] Fix build error in fmha unit tests

* [CK_TILE] add mqa, gqa to sequence padding unit tests

* [CI_TILE] Reduce the number of padding seqlen unit tests in FMHA to avoid timeouts in CI

* [CK_TILE] remove unnecessary MageKArgs overload in FmhaFwdV3Kernel and FmhaFwdKernel
2025-09-26 12:36:27 +08:00

54 lines
2.1 KiB
Bash
Executable File
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/bin/sh
# TODO: run this script from CK root or build directory
EXE="$(find . -name tile_example_fmha_fwd -type f | head -n 1)"
VALID=0
for prec in "fp16" "bf16" ; do
for perm in 0 1 ; do
for hdim in 64 128 256 ; do
nhead=$((2048 / $hdim)) # follow fav2 setup
$EXE -prec=$prec -b=32 -h=$nhead -d=$hdim -s=512 -iperm=$perm -operm=$perm -kname=1 -v=$VALID ; sleep 3
$EXE -prec=$prec -b=16 -h=$nhead -d=$hdim -s=1024 -iperm=$perm -operm=$perm -kname=1 -v=$VALID ; sleep 3
$EXE -prec=$prec -b=8 -h=$nhead -d=$hdim -s=2048 -iperm=$perm -operm=$perm -kname=1 -v=$VALID ; sleep 3
$EXE -prec=$prec -b=4 -h=$nhead -d=$hdim -s=4096 -iperm=$perm -operm=$perm -kname=1 -v=$VALID ; sleep 3
$EXE -prec=$prec -b=2 -h=$nhead -d=$hdim -s=8192 -iperm=$perm -operm=$perm -kname=1 -v=$VALID ; sleep 3
$EXE -prec=$prec -b=1 -h=$nhead -d=$hdim -s=16384 -iperm=$perm -operm=$perm -kname=1 -v=$VALID ; sleep 3
done
done
done
#Padding Benchmarks: batch mode (baseline vs low/med/high pad)
prec="fp16"
base_batch_args="-prec=$prec -mode=0 -b=4 -h=16 -h_k=16 -d=128 -s=1024 -bias=n -mask=0 -lse=0 -iperm=0 -operm=0 -vlayout=r -kname=1 -v=$VALID"
# baseline (no pad)
$EXE $base_batch_args
# low pad (≈9095% effective)
$EXE $base_batch_args -q_eff_lens=1024,960,992,896 -kv_eff_lens=1024,960,992,896
# medium pad (≈6075% effective)
$EXE $base_batch_args -q_eff_lens=896,768,512,640 -kv_eff_lens=896,768,512,640
# high pad (≈3040% effective)
$EXE $base_batch_args -q_eff_lens=512,384,256,320 -kv_eff_lens=512,384,256,320
# Padding Benchmarks: group mode (baseline vs low/med/high physical pad)
seqlens_q="1024,768,512,256"
seqlens_k="1024,768,512,256"
base_group_args="-prec=$prec -mode=1 -b=4 -h=16 -h_k=16 -d=128 -s=$seqlens_q -s_k=$seqlens_k -bias=n -mask=0 -lse=0 -iperm=0 -operm=0 -vlayout=r -kname=1 -v=$VALID"
# baseline (no physical pad)
$EXE $base_group_args
# low physical pad
$EXE $base_group_args -s_qpad=1152,896,576,320 -s_kpad=1152,896,576,320
# medium physical pad
$EXE $base_group_args -s_qpad=1536,1152,768,384 -s_kpad=1536,1152,768,384
# high physical pad
$EXE $base_group_args -s_qpad=2048,1536,1024,512 -s_kpad=2048,1536,1024,512