Add scripts for testing/benchmarking cross_attention cases

This commit is contained in:
Qianfeng Zhang
2026-04-16 15:45:57 +00:00
parent 7889844d6b
commit 5c84f54fd9
2 changed files with 476 additions and 0 deletions

View File

@@ -0,0 +1,238 @@
#!/bin/bash
#
# Benchmark driver for the HSTU attention example on jagged cross-attention
# cases.
#
# Every case invokes $EXE with -perf=1 and the fixed options listed in
# run_case below; only the batch size (-b), the key/value sequence length
# (-seqlens_kv) and the per-batch target list (-targets) vary.  Cases run
# batch size by batch size; within a batch size each target list is swept
# over all KV sequence lengths before moving on to the next list.
#
# Usage: run from the directory that contains build/ (BUILD is a relative
# path).  The script's exit status is always 0 because `set +x` is the last
# command; individual case failures are only visible in the output.

BUILD=build
EXE="$BUILD/bin/tile_example_hstu_attention"

# Values passed as -prec.
PRECISIONS=(bf16)

# Values passed as -seqlens_kv; every target list is run with each of them.
KV_SEQLENS=(1022 2044 4088 6132 8176 16352)

## the following cases are generated by using sparsity = 0.95
## Each array element is one comma-separated -targets list holding one value
## per batch entry; the trailing comment is the max_target it was generated for.

## for batch size 4
TARGETS_B4=(
  13,20,3,15                # max_target 32
  87,4,117,115              # max_target 128
  98,11,127,41              # max_target 160
  230,64,105,232            # max_target 256
  49,127,151,227            # max_target 300
  1487,286,1582,1691        # max_target 3200
)

## for batch size 8
TARGETS_B8=(
  12,16,24,30,28,6,6,5                      # max_target 32
  22,11,86,32,14,4,14,116                   # max_target 128
  144,140,65,145,16,146,155,79              # max_target 160
  37,21,8,157,215,70,99,184                 # max_target 256
  103,262,156,253,161,119,16,201            # max_target 300
  3140,1761,1439,144,2874,494,1295,1255     # max_target 3200
)

## for batch size 16
TARGETS_B16=(
  # max_target 32
  23,14,17,17,14,27,29,7,27,1,3,27,27,8,7,1
  # max_target 128
  103,86,121,21,35,17,2,27,93,31,11,108,5,86,21,51
  # max_target 160
  157,18,118,152,158,67,116,20,160,32,98,14,31,104,17,77
  # max_target 256
  239,156,221,117,90,193,151,218,173,42,123,6,54,210,114,190
  # max_target 300
  107,34,279,276,189,97,34,192,242,260,34,132,277,193,18,8
  # max_target 3200
  3013,2059,909,791,1346,2657,3012,1043,511,414,1284,2037,1802,2816,2009,760
)

## for batch size 32
TARGETS_B32=(
  # max_target 32
  7,19,3,11,15,26,26,21,10,4,16,6,14,17,26,19,11,31,16,12,6,1,28,11,18,13,11,2,26,13,31,6
  # max_target 128
  27,77,78,74,44,52,79,5,61,108,72,50,85,19,10,108,103,79,69,37,81,51,70,113,39,33,123,91,33,109,70,40
  # max_target 160
  57,135,96,34,104,112,52,156,67,13,82,20,127,37,30,93,48,133,2,23,44,141,106,16,138,62,138,34,139,41,52,120
  # max_target 256
  63,211,77,104,202,134,227,156,125,72,29,173,239,197,210,240,147,82,101,209,56,187,181,172,195,165,231,46,178,201,125,78
  # max_target 300
  213,290,136,222,173,57,175,244,100,6,152,254,132,118,200,219,63,110,37,197,61,130,271,214,228,4,131,120,151,95,45,248
  # max_target 3200
  2870,662,3012,1221,2143,1345,2254,1296,659,3003,490,2942,488,889,2232,1034,927,2462,716,987,1030,2410,2746,2283,255,2375,2308,1334,1183,1054,2088,1837
)

#######################################
# Run a single benchmark case.
# Globals:   EXE (read)
# Arguments: $1 - precision (-prec)
#            $2 - batch size (-b)
#            $3 - key/value sequence length (-seqlens_kv)
#            $4 - comma-separated target list (-targets)
# Returns:   exit status of $EXE
#######################################
run_case() {
  local prec=$1 batch=$2 kv_len=$3 tgts=$4
  "$EXE" -perf=1 -prec="$prec" -b="$batch" -jagged=1 -nhead=4 \
    -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv="$kv_len" \
    -causal=0 -local_len=0 -context_len=0 -minfull_len=0 \
    -targets="$tgts" -attn_scale=0 -norm_dist=0
}

#######################################
# Run every (precision, target list, KV length) combination for one batch size.
# Globals:   PRECISIONS, KV_SEQLENS (read)
# Arguments: $1 - batch size; remaining arguments - one target list each
#######################################
run_batch() {
  local batch=$1
  shift
  local prec tgts kv_len
  for prec in "${PRECISIONS[@]}"; do
    for tgts in "$@"; do
      for kv_len in "${KV_SEQLENS[@]}"; do
        run_case "$prec" "$batch" "$kv_len" "$tgts"
      done
    done
  done
}

#######################################
# Run the whole sweep, batch sizes 4, 8, 16 and 32 in that order.
# Globals:   EXE, TARGETS_B4, TARGETS_B8, TARGETS_B16, TARGETS_B32 (read)
# Outputs:   one diagnostic on stderr when $EXE cannot be executed
# Returns:   1 when $EXE is missing, otherwise the status of the last case
#######################################
run_all_cases() {
  # Report a missing binary once instead of once per case.
  if ! command -v "$EXE" >/dev/null 2>&1; then
    printf 'error: %s not found or not executable; build the example first\n' \
      "$EXE" >&2
    return 1
  fi
  run_batch 4 "${TARGETS_B4[@]}"
  run_batch 8 "${TARGETS_B8[@]}"
  run_batch 16 "${TARGETS_B16[@]}"
  run_batch 32 "${TARGETS_B32[@]}"
}

# Trace each invocation so every result can be matched to its command line.
set -x
run_all_cases
set +x

View File

@@ -0,0 +1,238 @@
#!/bin/bash
# Runs the tile_example_hstu_attention example binary over a fixed list of
# jagged bf16 cases, passing -v=1 on every invocation.
# BUILD is a path relative to the current working directory, so the script
# must be started from the directory that contains the build tree.
BUILD="build"
EXE="${BUILD}/bin/tile_example_hstu_attention"
# Trace every command so each case's full argument list appears in the log.
set -o xtrace
## the following cases are generated by using sparsity = 0.95
## for batches 4
# Runs "$EXE" with -v=1 and -b=4 once per (target set, KV length) pair:
# 6 target sets x 6 values of -seqlens_kv = 36 invocations, in the order
# target set first, KV length second. All other flags are fixed.
# Globals:   EXE (read) - path of the example binary to launch
# Arguments: none
# Returns:   exit status of the last invocation
run_verify_b4() {
  # Values passed as -seqlens_kv for every target set.
  local -a kv_lens=(1022 2044 4088 6132 8176 16352)
  # Values passed as -targets; each list has 4 comma-separated entries,
  # matching -b=4.
  local -a target_sets=(
    13,20,3,15           # generated for max_target 32
    87,4,117,115         # generated for max_target 128
    98,11,127,41         # generated for max_target 160
    230,64,105,232       # generated for max_target 256
    49,127,151,227       # generated for max_target 300
    1487,286,1582,1691   # generated for max_target 3200
  )
  local prec tgts kv_len

  for prec in "bf16"; do
    for tgts in "${target_sets[@]}"; do
      for kv_len in "${kv_lens[@]}"; do
        # Expansions are quoted so a path or value containing whitespace
        # is still passed as a single word.
        "$EXE" -v=1 -prec="$prec" -b=4 -jagged=1 -nhead=4 \
          -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv="$kv_len" \
          -causal=0 -local_len=0 -context_len=0 -minfull_len=0 \
          -targets="$tgts" -attn_scale=0 -norm_dist=0
      done
    done
  done
}
run_verify_b4
## for batches 8
# Runs "$EXE" with -v=1 and -b=8 once per (target set, KV length) pair:
# 6 target sets x 6 values of -seqlens_kv = 36 invocations, in the order
# target set first, KV length second. All other flags are fixed.
# Globals:   EXE (read) - path of the example binary to launch
# Arguments: none
# Returns:   exit status of the last invocation
run_verify_b8() {
  # Values passed as -seqlens_kv for every target set.
  local -a kv_lens=(1022 2044 4088 6132 8176 16352)
  # Values passed as -targets; each list has 8 comma-separated entries,
  # matching -b=8.
  local -a target_sets=(
    12,16,24,30,28,6,6,5                      # generated for max_target 32
    22,11,86,32,14,4,14,116                   # generated for max_target 128
    144,140,65,145,16,146,155,79              # generated for max_target 160
    37,21,8,157,215,70,99,184                 # generated for max_target 256
    103,262,156,253,161,119,16,201            # generated for max_target 300
    3140,1761,1439,144,2874,494,1295,1255     # generated for max_target 3200
  )
  local prec tgts kv_len

  for prec in "bf16"; do
    for tgts in "${target_sets[@]}"; do
      for kv_len in "${kv_lens[@]}"; do
        # Expansions are quoted so a path or value containing whitespace
        # is still passed as a single word.
        "$EXE" -v=1 -prec="$prec" -b=8 -jagged=1 -nhead=4 \
          -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv="$kv_len" \
          -causal=0 -local_len=0 -context_len=0 -minfull_len=0 \
          -targets="$tgts" -attn_scale=0 -norm_dist=0
      done
    done
  done
}
run_verify_b8
## for batches 16
# Runs "$EXE" with -v=1 and -b=16 once per (target set, KV length) pair:
# 6 target sets x 6 values of -seqlens_kv = 36 invocations, in the order
# target set first, KV length second. All other flags are fixed.
# Globals:   EXE (read) - path of the example binary to launch
# Arguments: none
# Returns:   exit status of the last invocation
run_verify_b16() {
  # Values passed as -seqlens_kv for every target set.
  local -a kv_lens=(1022 2044 4088 6132 8176 16352)
  # Values passed as -targets; each list has 16 comma-separated entries,
  # matching -b=16.
  local -a target_sets=(
    # generated for max_target 32
    23,14,17,17,14,27,29,7,27,1,3,27,27,8,7,1
    # generated for max_target 128
    103,86,121,21,35,17,2,27,93,31,11,108,5,86,21,51
    # generated for max_target 160
    157,18,118,152,158,67,116,20,160,32,98,14,31,104,17,77
    # generated for max_target 256
    239,156,221,117,90,193,151,218,173,42,123,6,54,210,114,190
    # generated for max_target 300
    107,34,279,276,189,97,34,192,242,260,34,132,277,193,18,8
    # generated for max_target 3200
    3013,2059,909,791,1346,2657,3012,1043,511,414,1284,2037,1802,2816,2009,760
  )
  local prec tgts kv_len

  for prec in "bf16"; do
    for tgts in "${target_sets[@]}"; do
      for kv_len in "${kv_lens[@]}"; do
        # Expansions are quoted so a path or value containing whitespace
        # is still passed as a single word.
        "$EXE" -v=1 -prec="$prec" -b=16 -jagged=1 -nhead=4 \
          -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv="$kv_len" \
          -causal=0 -local_len=0 -context_len=0 -minfull_len=0 \
          -targets="$tgts" -attn_scale=0 -norm_dist=0
      done
    done
  done
}
run_verify_b16
## for batches 32
# Runs "$EXE" with -v=1 and -b=32 once per (target set, KV length) pair:
# 6 target sets x 6 values of -seqlens_kv = 36 invocations, in the order
# target set first, KV length second. All other flags are fixed.
# Globals:   EXE (read) - path of the example binary to launch
# Arguments: none
# Returns:   exit status of the last invocation
run_verify_b32() {
  # Values passed as -seqlens_kv for every target set.
  local -a kv_lens=(1022 2044 4088 6132 8176 16352)
  # Values passed as -targets; each list has 32 comma-separated entries,
  # matching -b=32.
  local -a target_sets=(
    # generated for max_target 32
    7,19,3,11,15,26,26,21,10,4,16,6,14,17,26,19,11,31,16,12,6,1,28,11,18,13,11,2,26,13,31,6
    # generated for max_target 128
    27,77,78,74,44,52,79,5,61,108,72,50,85,19,10,108,103,79,69,37,81,51,70,113,39,33,123,91,33,109,70,40
    # generated for max_target 160
    57,135,96,34,104,112,52,156,67,13,82,20,127,37,30,93,48,133,2,23,44,141,106,16,138,62,138,34,139,41,52,120
    # generated for max_target 256
    63,211,77,104,202,134,227,156,125,72,29,173,239,197,210,240,147,82,101,209,56,187,181,172,195,165,231,46,178,201,125,78
    # generated for max_target 300
    213,290,136,222,173,57,175,244,100,6,152,254,132,118,200,219,63,110,37,197,61,130,271,214,228,4,131,120,151,95,45,248
    # generated for max_target 3200
    2870,662,3012,1221,2143,1345,2254,1296,659,3003,490,2942,488,889,2232,1034,927,2462,716,987,1030,2410,2746,2283,255,2375,2308,1334,1183,1054,2088,1837
  )
  local prec tgts kv_len

  for prec in "bf16"; do
    for tgts in "${target_sets[@]}"; do
      for kv_len in "${kv_lens[@]}"; do
        # Expansions are quoted so a path or value containing whitespace
        # is still passed as a single word.
        "$EXE" -v=1 -prec="$prec" -b=32 -jagged=1 -nhead=4 \
          -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv="$kv_len" \
          -causal=0 -local_len=0 -context_len=0 -minfull_len=0 \
          -targets="$tgts" -attn_scale=0 -norm_dist=0
      done
    done
  done
}
run_verify_b32
# Turn command tracing back off now that the last case has run.
set +x