From 67f9461b4290648fb3b3a69a2cc1ff59aacf7ea2 Mon Sep 17 00:00:00 2001 From: Qianfeng Zhang Date: Fri, 17 Apr 2026 09:38:41 +0000 Subject: [PATCH] Simplification in the cross_attention testing/benchmarking scripts --- .../bench_cross_attention_with_sparsity.sh | 242 +++++++++--------- .../test_cross_attention_with_sparsity.sh | 242 +++++++++--------- 2 files changed, 242 insertions(+), 242 deletions(-) diff --git a/example/ck_tile/18_hstu_attention/scripts/bench_cross_attention_with_sparsity.sh b/example/ck_tile/18_hstu_attention/scripts/bench_cross_attention_with_sparsity.sh index 71ad0ee040..d49d06324a 100755 --- a/example/ck_tile/18_hstu_attention/scripts/bench_cross_attention_with_sparsity.sh +++ b/example/ck_tile/18_hstu_attention/scripts/bench_cross_attention_with_sparsity.sh @@ -1,7 +1,7 @@ #!/bin/bash BUILD=build -EXE="$BUILD/bin/tile_example_hstu_attention" +EXE="$BUILD/bin/tile_example_hstu_attention -v=0 -perf=1" set -x @@ -11,192 +11,192 @@ set -x for T in "bf16"; do ## for max_target 32 tgts=13,20,3,15 - $EXE -perf=1 -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=1022 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -perf=1 -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=2044 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -perf=1 -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=4088 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -perf=1 -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=6132 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -perf=1 -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=8176 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -perf=1 -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=16352 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=1022 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=2044 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=4088 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=6132 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=8176 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=16352 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 ## for max_target 128 tgts=87,4,117,115 - $EXE -perf=1 -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=1022 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -perf=1 -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=2044 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -perf=1 -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=4088 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -perf=1 -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=6132 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -perf=1 -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=8176 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -perf=1 -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=16352 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=1022 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=2044 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=4088 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=6132 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=8176 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=16352 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 ## for max_target 160 tgts=98,11,127,41 - $EXE -perf=1 -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=1022 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -perf=1 -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=2044 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -perf=1 -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=4088 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -perf=1 -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=6132 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -perf=1 -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=8176 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -perf=1 -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=16352 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=1022 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=2044 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=4088 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=6132 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=8176 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=16352 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 ## for max_target 256 tgts=230,64,105,232 - $EXE -perf=1 -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=1022 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -perf=1 -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=2044 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -perf=1 -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=4088 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -perf=1 -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=6132 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -perf=1 -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=8176 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -perf=1 -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=16352 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=1022 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=2044 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=4088 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=6132 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=8176 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=16352 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 ## for max_target 300 tgts=49,127,151,227 - $EXE -perf=1 -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=1022 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -perf=1 -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=2044 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -perf=1 -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=4088 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -perf=1 -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=6132 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -perf=1 -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=8176 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -perf=1 -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=16352 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=1022 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=2044 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=4088 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=6132 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=8176 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=16352 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 done ## for batches 8 for T in "bf16"; do ## for max_target 32 tgts=12,16,24,30,28,6,6,5 - $EXE -perf=1 -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=1022 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -perf=1 -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=2044 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -perf=1 -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=4088 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -perf=1 -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=6132 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -perf=1 -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=8176 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -perf=1 -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=16352 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=1022 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=2044 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=4088 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=6132 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=8176 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=16352 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 ## for max_target 128 tgts=22,11,86,32,14,4,14,116 - $EXE -perf=1 -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=1022 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -perf=1 -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=2044 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -perf=1 -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=4088 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -perf=1 -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=6132 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -perf=1 -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=8176 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -perf=1 -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=16352 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=1022 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=2044 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=4088 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=6132 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=8176 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=16352 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 ## for max_target 160 tgts=144,140,65,145,16,146,155,79 - $EXE -perf=1 -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=1022 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -perf=1 -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=2044 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -perf=1 -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=4088 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -perf=1 -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=6132 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -perf=1 -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=8176 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -perf=1 -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=16352 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=1022 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=2044 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=4088 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=6132 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=8176 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=16352 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 ## for max_target 256 tgts=37,21,8,157,215,70,99,184 - $EXE -perf=1 -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=1022 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -perf=1 -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=2044 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -perf=1 -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=4088 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -perf=1 -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=6132 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -perf=1 -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=8176 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -perf=1 -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=16352 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=1022 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=2044 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=4088 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=6132 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=8176 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=16352 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 ## for max_target 300 tgts=103,262,156,253,161,119,16,201 - $EXE -perf=1 -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=1022 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -perf=1 -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=2044 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -perf=1 -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=4088 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -perf=1 -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=6132 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -perf=1 -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=8176 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -perf=1 -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=16352 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=1022 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=2044 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=4088 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=6132 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=8176 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=16352 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 done ## for batches 16 for T in "bf16"; do ## for max_target 32 tgts=23,14,17,17,14,27,29,7,27,1,3,27,27,8,7,1 - $EXE -perf=1 -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=1022 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -perf=1 -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=2044 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -perf=1 -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=4088 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -perf=1 -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=6132 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -perf=1 -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=8176 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -perf=1 -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=16352 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=1022 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=2044 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=4088 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=6132 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=8176 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=16352 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 ## for max_target 128 tgts=103,86,121,21,35,17,2,27,93,31,11,108,5,86,21,51 - $EXE -perf=1 -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=1022 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -perf=1 -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=2044 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -perf=1 -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=4088 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -perf=1 -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=6132 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -perf=1 -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=8176 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -perf=1 -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=16352 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=1022 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=2044 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=4088 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=6132 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=8176 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=16352 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 ## for max_target 160 tgts=157,18,118,152,158,67,116,20,160,32,98,14,31,104,17,77 - $EXE -perf=1 -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=1022 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -perf=1 -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=2044 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -perf=1 -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=4088 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -perf=1 -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=6132 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -perf=1 -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=8176 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -perf=1 -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=16352 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=1022 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=2044 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=4088 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=6132 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=8176 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=16352 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 ## for max_target 256 tgts=239,156,221,117,90,193,151,218,173,42,123,6,54,210,114,190 - $EXE -perf=1 -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=1022 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -perf=1 -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=2044 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -perf=1 -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=4088 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -perf=1 -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=6132 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -perf=1 -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=8176 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -perf=1 -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=16352 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=1022 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=2044 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=4088 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=6132 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=8176 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=16352 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 ## for max_target 300 tgts=107,34,279,276,189,97,34,192,242,260,34,132,277,193,18,8 - $EXE -perf=1 -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=1022 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -perf=1 -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=2044 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -perf=1 -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=4088 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -perf=1 -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=6132 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -perf=1 -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=8176 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -perf=1 -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=16352 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=1022 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=2044 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=4088 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=6132 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=8176 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=16352 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 done ## for batches 32 for T in "bf16"; do ## for max_target 32 tgts=7,19,3,11,15,26,26,21,10,4,16,6,14,17,26,19,11,31,16,12,6,1,28,11,18,13,11,2,26,13,31,6 - $EXE -perf=1 -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=1022 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -perf=1 -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=2044 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -perf=1 -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=4088 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -perf=1 -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=6132 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -perf=1 -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=8176 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -perf=1 -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=16352 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=1022 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=2044 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=4088 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=6132 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=8176 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=16352 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 ## for max_target 128 tgts=27,77,78,74,44,52,79,5,61,108,72,50,85,19,10,108,103,79,69,37,81,51,70,113,39,33,123,91,33,109,70,40 - $EXE -perf=1 -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=1022 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -perf=1 -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=2044 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -perf=1 -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=4088 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -perf=1 -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=6132 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -perf=1 -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=8176 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -perf=1 -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=16352 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=1022 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=2044 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=4088 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=6132 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=8176 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=16352 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 ## for max_target 160 tgts=57,135,96,34,104,112,52,156,67,13,82,20,127,37,30,93,48,133,2,23,44,141,106,16,138,62,138,34,139,41,52,120 - $EXE -perf=1 -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=1022 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -perf=1 -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=2044 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -perf=1 -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=4088 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -perf=1 -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=6132 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -perf=1 -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=8176 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -perf=1 -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=16352 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=1022 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=2044 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=4088 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=6132 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=8176 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=16352 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 ## for max_target 256 tgts=63,211,77,104,202,134,227,156,125,72,29,173,239,197,210,240,147,82,101,209,56,187,181,172,195,165,231,46,178,201,125,78 - $EXE -perf=1 -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=1022 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -perf=1 -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=2044 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -perf=1 -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=4088 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -perf=1 -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=6132 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -perf=1 -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=8176 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -perf=1 -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=16352 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=1022 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=2044 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=4088 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=6132 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=8176 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=16352 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 ## for max_target 300 tgts=213,290,136,222,173,57,175,244,100,6,152,254,132,118,200,219,63,110,37,197,61,130,271,214,228,4,131,120,151,95,45,248 - $EXE -perf=1 -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=1022 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -perf=1 -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=2044 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -perf=1 -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=4088 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -perf=1 -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=6132 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -perf=1 -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=8176 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -perf=1 -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=16352 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=1022 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=2044 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=4088 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=6132 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=8176 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=16352 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 done set +x diff --git a/example/ck_tile/18_hstu_attention/scripts/test_cross_attention_with_sparsity.sh b/example/ck_tile/18_hstu_attention/scripts/test_cross_attention_with_sparsity.sh index 2bfc0302c7..06058f4921 100755 --- a/example/ck_tile/18_hstu_attention/scripts/test_cross_attention_with_sparsity.sh +++ b/example/ck_tile/18_hstu_attention/scripts/test_cross_attention_with_sparsity.sh @@ -1,7 +1,7 @@ #!/bin/bash BUILD=build -EXE="$BUILD/bin/tile_example_hstu_attention" +EXE="$BUILD/bin/tile_example_hstu_attention -v=1" set -x @@ -11,192 +11,192 @@ set -x for T in "bf16"; do ## for max_target 32 tgts=13,20,3,15 - $EXE -v=1 -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=1022 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -v=1 -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=2044 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -v=1 -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=4088 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -v=1 -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=6132 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -v=1 -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=8176 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -v=1 -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=16352 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=1022 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=2044 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=4088 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=6132 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=8176 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=16352 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 ## for max_target 128 tgts=87,4,117,115 - $EXE -v=1 -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=1022 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -v=1 -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=2044 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -v=1 -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=4088 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -v=1 -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=6132 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -v=1 -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=8176 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -v=1 -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=16352 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=1022 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=2044 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=4088 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=6132 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=8176 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=16352 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 ## for max_target 160 tgts=98,11,127,41 - $EXE -v=1 -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=1022 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -v=1 -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=2044 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -v=1 -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=4088 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -v=1 -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=6132 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -v=1 -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=8176 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -v=1 -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=16352 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=1022 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=2044 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=4088 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=6132 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=8176 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=16352 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 ## for max_target 256 tgts=230,64,105,232 - $EXE -v=1 -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=1022 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -v=1 -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=2044 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -v=1 -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=4088 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -v=1 -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=6132 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -v=1 -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=8176 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -v=1 -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=16352 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=1022 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=2044 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=4088 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=6132 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=8176 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=16352 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 ## for max_target 300 tgts=49,127,151,227 - $EXE -v=1 -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=1022 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -v=1 -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=2044 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -v=1 -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=4088 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -v=1 -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=6132 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -v=1 -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=8176 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -v=1 -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=16352 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=1022 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=2044 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=4088 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=6132 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=8176 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=4 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=16352 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 done ## for batches 8 for T in "bf16"; do ## for max_target 32 tgts=12,16,24,30,28,6,6,5 - $EXE -v=1 -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=1022 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -v=1 -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=2044 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -v=1 -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=4088 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -v=1 -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=6132 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -v=1 -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=8176 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -v=1 -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=16352 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=1022 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=2044 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=4088 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=6132 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=8176 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=16352 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 ## for max_target 128 tgts=22,11,86,32,14,4,14,116 - $EXE -v=1 -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=1022 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -v=1 -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=2044 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -v=1 -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=4088 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -v=1 -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=6132 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -v=1 -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=8176 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -v=1 -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=16352 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=1022 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=2044 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=4088 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=6132 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=8176 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=16352 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 ## for max_target 160 tgts=144,140,65,145,16,146,155,79 - $EXE -v=1 -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=1022 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -v=1 -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=2044 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -v=1 -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=4088 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -v=1 -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=6132 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -v=1 -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=8176 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -v=1 -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=16352 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=1022 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=2044 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=4088 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=6132 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=8176 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=16352 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 ## for max_target 256 tgts=37,21,8,157,215,70,99,184 - $EXE -v=1 -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=1022 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -v=1 -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=2044 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -v=1 -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=4088 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -v=1 -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=6132 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -v=1 -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=8176 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -v=1 -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=16352 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=1022 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=2044 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=4088 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=6132 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=8176 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=16352 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 ## for max_target 300 tgts=103,262,156,253,161,119,16,201 - $EXE -v=1 -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=1022 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -v=1 -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=2044 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -v=1 -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=4088 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -v=1 -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=6132 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -v=1 -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=8176 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -v=1 -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=16352 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=1022 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=2044 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=4088 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=6132 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=8176 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=8 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=16352 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 done ## for batches 16 for T in "bf16"; do ## for max_target 32 tgts=23,14,17,17,14,27,29,7,27,1,3,27,27,8,7,1 - $EXE -v=1 -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=1022 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -v=1 -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=2044 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -v=1 -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=4088 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -v=1 -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=6132 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -v=1 -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=8176 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -v=1 -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=16352 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=1022 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=2044 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=4088 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=6132 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=8176 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=16352 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 ## for max_target 128 tgts=103,86,121,21,35,17,2,27,93,31,11,108,5,86,21,51 - $EXE -v=1 -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=1022 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -v=1 -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=2044 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -v=1 -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=4088 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -v=1 -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=6132 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -v=1 -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=8176 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -v=1 -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=16352 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=1022 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=2044 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=4088 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=6132 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=8176 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=16352 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 ## for max_target 160 tgts=157,18,118,152,158,67,116,20,160,32,98,14,31,104,17,77 - $EXE -v=1 -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=1022 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -v=1 -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=2044 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -v=1 -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=4088 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -v=1 -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=6132 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -v=1 -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=8176 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -v=1 -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=16352 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=1022 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=2044 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=4088 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=6132 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=8176 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=16352 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 ## for max_target 256 tgts=239,156,221,117,90,193,151,218,173,42,123,6,54,210,114,190 - $EXE -v=1 -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=1022 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -v=1 -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=2044 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -v=1 -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=4088 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -v=1 -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=6132 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -v=1 -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=8176 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -v=1 -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=16352 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=1022 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=2044 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=4088 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=6132 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=8176 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=16352 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 ## for max_target 300 tgts=107,34,279,276,189,97,34,192,242,260,34,132,277,193,18,8 - $EXE -v=1 -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=1022 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -v=1 -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=2044 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -v=1 -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=4088 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -v=1 -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=6132 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -v=1 -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=8176 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -v=1 -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=16352 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=1022 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=2044 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=4088 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=6132 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=8176 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=16 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=16352 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 done ## for batches 32 for T in "bf16"; do ## for max_target 32 tgts=7,19,3,11,15,26,26,21,10,4,16,6,14,17,26,19,11,31,16,12,6,1,28,11,18,13,11,2,26,13,31,6 - $EXE -v=1 -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=1022 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -v=1 -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=2044 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -v=1 -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=4088 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -v=1 -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=6132 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -v=1 -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=8176 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -v=1 -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=16352 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=1022 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=2044 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=4088 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=6132 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=8176 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=16352 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 ## for max_target 128 tgts=27,77,78,74,44,52,79,5,61,108,72,50,85,19,10,108,103,79,69,37,81,51,70,113,39,33,123,91,33,109,70,40 - $EXE -v=1 -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=1022 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -v=1 -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=2044 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -v=1 -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=4088 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -v=1 -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=6132 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -v=1 -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=8176 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -v=1 -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=16352 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=1022 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=2044 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=4088 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=6132 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=8176 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=16352 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 ## for max_target 160 tgts=57,135,96,34,104,112,52,156,67,13,82,20,127,37,30,93,48,133,2,23,44,141,106,16,138,62,138,34,139,41,52,120 - $EXE -v=1 -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=1022 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -v=1 -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=2044 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -v=1 -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=4088 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -v=1 -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=6132 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -v=1 -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=8176 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -v=1 -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=16352 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=1022 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=2044 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=4088 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=6132 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=8176 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=16352 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 ## for max_target 256 tgts=63,211,77,104,202,134,227,156,125,72,29,173,239,197,210,240,147,82,101,209,56,187,181,172,195,165,231,46,178,201,125,78 - $EXE -v=1 -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=1022 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -v=1 -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=2044 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -v=1 -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=4088 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -v=1 -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=6132 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -v=1 -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=8176 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -v=1 -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=16352 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=1022 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=2044 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=4088 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=6132 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=8176 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=16352 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 ## for max_target 300 tgts=213,290,136,222,173,57,175,244,100,6,152,254,132,118,200,219,63,110,37,197,61,130,271,214,228,4,131,120,151,95,45,248 - $EXE -v=1 -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=1022 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -v=1 -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=2044 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -v=1 -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=4088 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -v=1 -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=6132 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -v=1 -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=8176 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 - $EXE -v=1 -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=16352 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=1022 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=2044 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=4088 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=6132 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=8176 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 + $EXE -prec=$T -b=32 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlens=128 -seqlens_kv=16352 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=$tgts -attn_scale=0 -norm_dist=0 done set +x