diff --git a/example/ck_tile/18_hstu_attention/scripts/bench_cross_attention_with_sparsity.sh b/example/ck_tile/18_hstu_attention/scripts/bench_cross_attention_with_sparsity.sh
new file mode 100755
index 0000000000..e9eba8681e
--- /dev/null
+++ b/example/ck_tile/18_hstu_attention/scripts/bench_cross_attention_with_sparsity.sh
@@ -0,0 +1,89 @@
+#!/bin/bash
+
+## Benchmark sweep for tile_example_hstu_attention with -jagged=1 and explicit
+## -seqlens_kv. Every (batch size, -targets list, seqlens_kv) combination below
+## is run once per precision with -perf=1; all remaining options are held fixed.
+##
+## Usage: run from the directory that contains the build tree, or point BUILD at
+##        it, e.g. BUILD=/path/to/build ./bench_cross_attention_with_sparsity.sh
+
+BUILD="${BUILD:-build}"
+EXE="$BUILD/bin/tile_example_hstu_attention"
+
+## fail once, up front, rather than printing one "not found" error per case
+if [[ ! -x "$EXE" ]]; then
+    echo "error: '$EXE' is missing or not executable (set BUILD to the build directory)" >&2
+    exit 1
+fi
+
+PRECISIONS=("bf16")
+SEQLENS_KV=(1022 2044 4088 6132 8176 16352)
+
+## the following cases are generated by using sparsity = 0.95
+## (one comma-separated -targets list per max_target, one array per batch size)
+
+TARGETS_B4=(
+    13,20,3,15                # max_target 32
+    87,4,117,115              # max_target 128
+    98,11,127,41              # max_target 160
+    230,64,105,232            # max_target 256
+    49,127,151,227            # max_target 300
+    1487,286,1582,1691        # max_target 3200
+)
+
+TARGETS_B8=(
+    12,16,24,30,28,6,6,5                        # max_target 32
+    22,11,86,32,14,4,14,116                     # max_target 128
+    144,140,65,145,16,146,155,79                # max_target 160
+    37,21,8,157,215,70,99,184                   # max_target 256
+    103,262,156,253,161,119,16,201              # max_target 300
+    3140,1761,1439,144,2874,494,1295,1255       # max_target 3200
+)
+
+TARGETS_B16=(
+    23,14,17,17,14,27,29,7,27,1,3,27,27,8,7,1                                         # max_target 32
+    103,86,121,21,35,17,2,27,93,31,11,108,5,86,21,51                                  # max_target 128
+    157,18,118,152,158,67,116,20,160,32,98,14,31,104,17,77                            # max_target 160
+    239,156,221,117,90,193,151,218,173,42,123,6,54,210,114,190                        # max_target 256
+    107,34,279,276,189,97,34,192,242,260,34,132,277,193,18,8                          # max_target 300
+    3013,2059,909,791,1346,2657,3012,1043,511,414,1284,2037,1802,2816,2009,760        # max_target 3200
+)
+
+TARGETS_B32=(
+    7,19,3,11,15,26,26,21,10,4,16,6,14,17,26,19,11,31,16,12,6,1,28,11,18,13,11,2,26,13,31,6    # max_target 32
+    27,77,78,74,44,52,79,5,61,108,72,50,85,19,10,108,103,79,69,37,81,51,70,113,39,33,123,91,33,109,70,40    # max_target 128
+    57,135,96,34,104,112,52,156,67,13,82,20,127,37,30,93,48,133,2,23,44,141,106,16,138,62,138,34,139,41,52,120    # max_target 160
+    63,211,77,104,202,134,227,156,125,72,29,173,239,197,210,240,147,82,101,209,56,187,181,172,195,165,231,46,178,201,125,78    # max_target 256
+    213,290,136,222,173,57,175,244,100,6,152,254,132,118,200,219,63,110,37,197,61,130,271,214,228,4,131,120,151,95,45,248    # max_target 300
+    2870,662,3012,1221,2143,1345,2254,1296,659,3003,490,2942,488,889,2232,1034,927,2462,716,987,1030,2410,2746,2283,255,2375,2308,1334,1183,1054,2088,1837    # max_target 3200
+)
+
+## Run the sweep for one batch size.
+##   $1    - value passed as -b
+##   $2... - -targets lists; each one is run against every SEQLENS_KV entry
+## A failing run does not stop the sweep; the remaining cases still execute.
+run_batch() {
+    local batch=$1
+    shift
+    local prec tgts kv
+    for prec in "${PRECISIONS[@]}"; do
+        for tgts in "$@"; do
+            for kv in "${SEQLENS_KV[@]}"; do
+                "$EXE" -perf=1 -prec="$prec" -b="$batch" -jagged=1 -nhead=4 \
+                    -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv="$kv" \
+                    -causal=0 -local_len=0 -context_len=0 -minfull_len=0 \
+                    -targets="$tgts" -attn_scale=0 -norm_dist=0
+            done
+        done
+    done
+}
+
+## echo each command line before it runs
+set -x
+
+run_batch 4 "${TARGETS_B4[@]}"
+run_batch 8 "${TARGETS_B8[@]}"
+run_batch 16 "${TARGETS_B16[@]}"
+run_batch 32 "${TARGETS_B32[@]}"
+
+set +x
diff --git a/example/ck_tile/18_hstu_attention/scripts/test_cross_attention_with_sparsity.sh b/example/ck_tile/18_hstu_attention/scripts/test_cross_attention_with_sparsity.sh new file mode 100755 index 0000000000..c3d5849946 --- /dev/null +++ b/example/ck_tile/18_hstu_attention/scripts/test_cross_attention_with_sparsity.sh @@ -0,0 +1,238 @@ +#!/bin/bash + +BUILD=build +EXE="$BUILD/bin/tile_example_hstu_attention" + +set -x + +## the following cases are generated by using sparsity = 0.95 + +## for batches 4 +for T in "bf16"; do + ## for max_target 32 + 
tgts=13,20,3,15 + $EXE -v=1 -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=1022 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -v=1 -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=2044 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -v=1 -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=4088 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -v=1 -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=6132 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -v=1 -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=8176 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -v=1 -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=16352 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + + ## for max_target 128 + tgts=87,4,117,115 + $EXE -v=1 -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=1022 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -v=1 -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=2044 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -v=1 -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=4088 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -v=1 -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=6132 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 
-norm_dist=0 + $EXE -v=1 -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=8176 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -v=1 -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=16352 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + + ## for max_target 160 + tgts=98,11,127,41 + $EXE -v=1 -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=1022 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -v=1 -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=2044 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -v=1 -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=4088 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -v=1 -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=6132 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -v=1 -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=8176 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -v=1 -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=16352 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + + ## for max_target 256 + tgts=230,64,105,232 + $EXE -v=1 -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=1022 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -v=1 -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=2044 -causal=0 -local_len=0 
-context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -v=1 -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=4088 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -v=1 -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=6132 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -v=1 -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=8176 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -v=1 -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=16352 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + + ## for max_target 300 + tgts=49,127,151,227 + $EXE -v=1 -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=1022 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -v=1 -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=2044 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -v=1 -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=4088 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -v=1 -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=6132 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -v=1 -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=8176 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -v=1 -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=16352 -causal=0 -local_len=0 
-context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + + ## for max_target 3200 + tgts=1487,286,1582,1691 + $EXE -v=1 -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=1022 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -v=1 -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=2044 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -v=1 -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=4088 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -v=1 -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=6132 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -v=1 -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=8176 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -v=1 -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=16352 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 +done + +## for batches 8 +for T in "bf16"; do + ## for max_target 32 + tgts=12,16,24,30,28,6,6,5 + $EXE -v=1 -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=1022 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -v=1 -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=2044 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -v=1 -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=4088 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -v=1 -prec=$T 
-b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=6132 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -v=1 -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=8176 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -v=1 -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=16352 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + + ## for max_target 128 + tgts=22,11,86,32,14,4,14,116 + $EXE -v=1 -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=1022 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -v=1 -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=2044 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -v=1 -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=4088 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -v=1 -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=6132 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -v=1 -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=8176 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -v=1 -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=16352 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + + ## for max_target 160 + tgts=144,140,65,145,16,146,155,79 + $EXE -v=1 -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=1022 -causal=0 -local_len=0 -context_len=0 
-minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -v=1 -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=2044 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -v=1 -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=4088 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -v=1 -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=6132 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -v=1 -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=8176 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -v=1 -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=16352 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + + ## for max_target 256 + tgts=37,21,8,157,215,70,99,184 + $EXE -v=1 -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=1022 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -v=1 -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=2044 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -v=1 -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=4088 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -v=1 -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=6132 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -v=1 -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=8176 -causal=0 -local_len=0 
-context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -v=1 -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=16352 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + + ## for max_target 300 + tgts=103,262,156,253,161,119,16,201 + $EXE -v=1 -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=1022 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -v=1 -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=2044 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -v=1 -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=4088 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -v=1 -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=6132 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -v=1 -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=8176 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -v=1 -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=16352 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + + ## for max_target 3200 + tgts=3140,1761,1439,144,2874,494,1295,1255 + $EXE -v=1 -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=1022 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -v=1 -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=2044 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -v=1 -prec=$T -b=8 
-jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=4088 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -v=1 -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=6132 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -v=1 -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=8176 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -v=1 -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=16352 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 +done + +## for batches 16 +for T in "bf16"; do + ## for max_target 32 + tgts=23,14,17,17,14,27,29,7,27,1,3,27,27,8,7,1 + $EXE -v=1 -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=1022 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -v=1 -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=2044 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -v=1 -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=4088 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -v=1 -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=6132 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -v=1 -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=8176 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -v=1 -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=16352 -causal=0 -local_len=0 -context_len=0 
-minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + + ## for max_target 128 + tgts=103,86,121,21,35,17,2,27,93,31,11,108,5,86,21,51 + $EXE -v=1 -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=1022 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -v=1 -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=2044 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -v=1 -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=4088 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -v=1 -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=6132 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -v=1 -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=8176 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -v=1 -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=16352 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + + ## for max_target 160 + tgts=157,18,118,152,158,67,116,20,160,32,98,14,31,104,17,77 + $EXE -v=1 -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=1022 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -v=1 -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=2044 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -v=1 -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=4088 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE 
-v=1 -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=6132 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -v=1 -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=8176 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -v=1 -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=16352 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + + ## for max_target 256 + tgts=239,156,221,117,90,193,151,218,173,42,123,6,54,210,114,190 + $EXE -v=1 -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=1022 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -v=1 -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=2044 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -v=1 -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=4088 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -v=1 -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=6132 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -v=1 -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=8176 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -v=1 -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=16352 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + + ## for max_target 300 + tgts=107,34,279,276,189,97,34,192,242,260,34,132,277,193,18,8 + $EXE -v=1 -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 
-hdim_v=128 -seqlens=128 -seqlens_kv=1022 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -v=1 -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=2044 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -v=1 -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=4088 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -v=1 -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=6132 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -v=1 -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=8176 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -v=1 -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=16352 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + + ## for max_target 3200 + tgts=3013,2059,909,791,1346,2657,3012,1043,511,414,1284,2037,1802,2816,2009,760 + $EXE -v=1 -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=1022 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -v=1 -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=2044 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -v=1 -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=4088 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -v=1 -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=6132 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 
-norm_dist=0 + $EXE -v=1 -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=8176 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -v=1 -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=16352 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 +done + +## for batches 32 +for T in "bf16"; do + ## for max_target 32 + tgts=7,19,3,11,15,26,26,21,10,4,16,6,14,17,26,19,11,31,16,12,6,1,28,11,18,13,11,2,26,13,31,6 + $EXE -v=1 -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=1022 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -v=1 -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=2044 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -v=1 -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=4088 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -v=1 -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=6132 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -v=1 -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=8176 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -v=1 -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=16352 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + + ## for max_target 128 + tgts=27,77,78,74,44,52,79,5,61,108,72,50,85,19,10,108,103,79,69,37,81,51,70,113,39,33,123,91,33,109,70,40 + $EXE -v=1 -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=1022 -causal=0 
-local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -v=1 -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=2044 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -v=1 -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=4088 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -v=1 -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=6132 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -v=1 -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=8176 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -v=1 -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=16352 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + + ## for max_target 160 + tgts=57,135,96,34,104,112,52,156,67,13,82,20,127,37,30,93,48,133,2,23,44,141,106,16,138,62,138,34,139,41,52,120 + $EXE -v=1 -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=1022 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -v=1 -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=2044 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -v=1 -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=4088 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -v=1 -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=6132 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE 
-v=1 -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=8176 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -v=1 -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=16352 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + + ## for max_target 256 + tgts=63,211,77,104,202,134,227,156,125,72,29,173,239,197,210,240,147,82,101,209,56,187,181,172,195,165,231,46,178,201,125,78 + $EXE -v=1 -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=1022 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -v=1 -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=2044 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -v=1 -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=4088 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -v=1 -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=6132 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -v=1 -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=8176 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -v=1 -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=16352 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + + ## for max_target 300 + tgts=213,290,136,222,173,57,175,244,100,6,152,254,132,118,200,219,63,110,37,197,61,130,271,214,228,4,131,120,151,95,45,248 + $EXE -v=1 -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=1022 -causal=0 -local_len=0 
-context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -v=1 -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=2044 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -v=1 -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=4088 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -v=1 -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=6132 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -v=1 -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=8176 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -v=1 -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=16352 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + + ## for max_target 3200 + tgts=2870,662,3012,1221,2143,1345,2254,1296,659,3003,490,2942,488,889,2232,1034,927,2462,716,987,1030,2410,2746,2283,255,2375,2308,1334,1183,1054,2088,1837 + $EXE -v=1 -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=1022 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -v=1 -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=2044 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -v=1 -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=4088 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -v=1 -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=6132 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts 
-attn_scale=0 -norm_dist=0 + $EXE -v=1 -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=8176 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -v=1 -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=16352 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 +done + +set +x