diff --git a/example/ck_tile/18_hstu_attention/scripts/bench_cross_attention_with_sparsity.sh b/example/ck_tile/18_hstu_attention/scripts/bench_cross_attention_with_sparsity.sh index e9eba8681e..71ad0ee040 100755 --- a/example/ck_tile/18_hstu_attention/scripts/bench_cross_attention_with_sparsity.sh +++ b/example/ck_tile/18_hstu_attention/scripts/bench_cross_attention_with_sparsity.sh @@ -53,15 +53,6 @@ for T in "bf16"; do $EXE -perf=1 -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=6132 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 $EXE -perf=1 -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=8176 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 $EXE -perf=1 -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=16352 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - - ## for max_target 3200 - tgts=1487,286,1582,1691 - $EXE -perf=1 -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=1022 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -perf=1 -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=2044 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -perf=1 -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=4088 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -perf=1 -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=6132 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -perf=1 -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=8176 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -perf=1 -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=16352 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 done ## for batches 8 @@ -110,15 +101,6 @@ for T in "bf16"; do $EXE -perf=1 -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=6132 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 $EXE -perf=1 -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=8176 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 $EXE -perf=1 -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=16352 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - - ## for max_target 3200 - tgts=3140,1761,1439,144,2874,494,1295,1255 - $EXE -perf=1 -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=1022 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -perf=1 -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=2044 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -perf=1 -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=4088 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -perf=1 -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=6132 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -perf=1 -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=8176 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -perf=1 -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=16352 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 done ## for batches 16 @@ -167,15 +149,6 @@ for T in "bf16"; do $EXE -perf=1 -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=6132 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 $EXE -perf=1 -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=8176 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 $EXE -perf=1 -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=16352 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - - ## for max_target 3200 - tgts=3013,2059,909,791,1346,2657,3012,1043,511,414,1284,2037,1802,2816,2009,760 - $EXE -perf=1 -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=1022 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -perf=1 -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=2044 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -perf=1 -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=4088 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -perf=1 -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=6132 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -perf=1 -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=8176 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -perf=1 -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=16352 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 done ## for batches 32 @@ -224,15 +197,6 @@ for T in "bf16"; do $EXE -perf=1 -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=6132 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 $EXE -perf=1 -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=8176 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 $EXE -perf=1 -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=16352 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - - ## for max_target 3200 - tgts=2870,662,3012,1221,2143,1345,2254,1296,659,3003,490,2942,488,889,2232,1034,927,2462,716,987,1030,2410,2746,2283,255,2375,2308,1334,1183,1054,2088,1837 - $EXE -perf=1 -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=1022 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -perf=1 -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=2044 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -perf=1 -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=4088 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -perf=1 -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=6132 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -perf=1 -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=8176 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -perf=1 -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=16352 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 done set +x diff --git a/example/ck_tile/18_hstu_attention/scripts/test_cross_attention_with_sparsity.sh b/example/ck_tile/18_hstu_attention/scripts/test_cross_attention_with_sparsity.sh index c3d5849946..2bfc0302c7 100755 --- a/example/ck_tile/18_hstu_attention/scripts/test_cross_attention_with_sparsity.sh +++ b/example/ck_tile/18_hstu_attention/scripts/test_cross_attention_with_sparsity.sh @@ -53,15 +53,6 @@ for T in "bf16"; do $EXE -v=1 -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=6132 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 $EXE -v=1 -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=8176 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 $EXE -v=1 -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=16352 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - - ## for max_target 3200 - tgts=1487,286,1582,1691 - $EXE -v=1 -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=1022 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -v=1 -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=2044 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -v=1 -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=4088 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -v=1 -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=6132 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -v=1 -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=8176 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -v=1 -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=16352 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 done ## for batches 8 @@ -110,15 +101,6 @@ for T in "bf16"; do $EXE -v=1 -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=6132 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 $EXE -v=1 -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=8176 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 $EXE -v=1 -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=16352 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - - ## for max_target 3200 - tgts=3140,1761,1439,144,2874,494,1295,1255 - $EXE -v=1 -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=1022 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -v=1 -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=2044 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -v=1 -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=4088 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -v=1 -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=6132 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -v=1 -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=8176 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -v=1 -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=16352 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 done ## for batches 16 @@ -167,15 +149,6 @@ for T in "bf16"; do $EXE -v=1 -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=6132 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 $EXE -v=1 -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=8176 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 $EXE -v=1 -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=16352 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - - ## for max_target 3200 - tgts=3013,2059,909,791,1346,2657,3012,1043,511,414,1284,2037,1802,2816,2009,760 - $EXE -v=1 -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=1022 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -v=1 -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=2044 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -v=1 -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=4088 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -v=1 -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=6132 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -v=1 -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=8176 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -v=1 -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=16352 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 done ## for batches 32 @@ -224,15 +197,6 @@ for T in "bf16"; do $EXE -v=1 -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=6132 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 $EXE -v=1 -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=8176 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 $EXE -v=1 -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=16352 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - - ## for max_target 3200 - tgts=2870,662,3012,1221,2143,1345,2254,1296,659,3003,490,2942,488,889,2232,1034,927,2462,716,987,1030,2410,2746,2283,255,2375,2308,1334,1183,1054,2088,1837 - $EXE -v=1 -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=1022 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -v=1 -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=2044 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -v=1 -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=4088 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -v=1 -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=6132 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -v=1 -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=8176 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -v=1 -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=16352 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 done set +x