From 86c0e45987574034dd71d612a2cd80e039b5eca5 Mon Sep 17 00:00:00 2001 From: Qianfeng Zhang Date: Wed, 9 Apr 2025 08:28:05 +0000 Subject: [PATCH] Add benchmark_hstu_attention.sh --- .../benchmark_hstu_attention.sh | 16 ++++++++++++++++ .../18_hstu_attention/example_hstu_attention.cpp | 10 +++++----- .../18_hstu_attention/test_hstu_attention.sh | 15 +++++++++------ 3 files changed, 30 insertions(+), 11 deletions(-) create mode 100644 example/ck_tile/18_hstu_attention/benchmark_hstu_attention.sh diff --git a/example/ck_tile/18_hstu_attention/benchmark_hstu_attention.sh b/example/ck_tile/18_hstu_attention/benchmark_hstu_attention.sh new file mode 100644 index 0000000000..0c2c10af50 --- /dev/null +++ b/example/ck_tile/18_hstu_attention/benchmark_hstu_attention.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +BUILD=build +EXE=$BUILD/bin/tile_example_hstu_attention + +for dtype in "fp16" "bf16"; do + ## jagged is true + cmd="$EXE -v=0 -prec=$dtype -b=80 -jagged=1 -nhead=8 -hdim_qk=128 -hdim_v=128 -seqlen=1000 -causal=1 -local_len=5 -context_len=8 -minfull_len=7 -targets=8 -perf=1" + echo $cmd + $EXE -v=0 -prec=$dtype -b=80 -jagged=1 -nhead=8 -hdim_qk=128 -hdim_v=128 -seqlen=1000 -causal=1 -local_len=5 -context_len=8 -minfull_len=7 -targets=8 -perf=1 + + ## jagged is false + cmd="$EXE -v=0 -prec=$dtype -b=80 -jagged=0 -nhead=8 -hdim_qk=128 -hdim_v=128 -seqlen=1000 -causal=1 -local_len=5 -context_len=8 -minfull_len=7 -targets=8 -perf=1" + echo $cmd + $EXE -v=0 -prec=$dtype -b=80 -jagged=0 -nhead=8 -hdim_qk=128 -hdim_v=128 -seqlen=1000 -causal=1 -local_len=5 -context_len=8 -minfull_len=7 -targets=8 -perf=1 +done diff --git a/example/ck_tile/18_hstu_attention/example_hstu_attention.cpp b/example/ck_tile/18_hstu_attention/example_hstu_attention.cpp index e04cf76a40..4af9e72e23 100644 --- a/example/ck_tile/18_hstu_attention/example_hstu_attention.cpp +++ b/example/ck_tile/18_hstu_attention/example_hstu_attention.cpp @@ -440,8 +440,8 @@ bool run(const ck_tile::ArgParser& arg_parser) o_dev.FromDevice(o_host.data()); - dumpBufferToFile("output_dev.dat", o_host.data(), o_host.get_element_space_size()); - dumpBufferToFile("output_host.dat", o_host_ref.data(), o_host.get_element_space_size()); + // dumpBufferToFile("output_dev.dat", o_host.data(), o_host.get_element_space_size()); + // dumpBufferToFile("output_host.dat", o_host_ref.data(), o_host.get_element_space_size()); auto [rtol, atol] = get_elimit(); @@ -454,7 +454,7 @@ bool run(const ck_tile::ArgParser& arg_parser) ck_tile::gpu_timer timer{}; timer.start(stream); - for(int i = 0; i < 20; i++) + for(int i = 0; i < 10; i++) { if constexpr(std::is_same::value) { @@ -473,9 +473,9 @@ bool run(const ck_tile::ArgParser& arg_parser) } timer.stop(stream); - auto ms = timer.duration() / 20.f; + auto ms = timer.duration() / 10.f; - std::cout << "Average execution time of the hstu_attention operator is " << ms + std::cout << "Average execution time of the hstu_attention operation is " << ms << " milli-seconds" << std::endl; } diff --git a/example/ck_tile/18_hstu_attention/test_hstu_attention.sh b/example/ck_tile/18_hstu_attention/test_hstu_attention.sh index a52c2aa11a..1b42f60efc 100644 --- a/example/ck_tile/18_hstu_attention/test_hstu_attention.sh +++ b/example/ck_tile/18_hstu_attention/test_hstu_attention.sh @@ -1,20 +1,23 @@ #!/bin/bash +BUILD=build +EXE=$BUILD/bin/tile_example_hstu_attention + ## no masking batched -bin/tile_example_hstu_attention -v=1 -prec=fp16 -b=10 -jagged=0 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlen=256 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=0 +$EXE -v=1 -prec=fp16 -b=10 -jagged=0 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlen=256 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=0 ## no masking jagged -bin/tile_example_hstu_attention -v=1 -prec=fp16 -b=10 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlen=300,300,290,280,310 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=0 +$EXE -v=1 -prec=fp16 -b=10 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlen=300,300,290,280,310 -causal=0 -local_len=0 -context_len=0 -minfull_len=0 -targets=0 ## batched causal -bin/tile_example_hstu_attention -v=1 -prec=fp16 -b=10 -jagged=0 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlen=256 -causal=1 -local_len=0 -context_len=0 -minfull_len=0 -targets=0 +$EXE -v=1 -prec=fp16 -b=10 -jagged=0 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlen=256 -causal=1 -local_len=0 -context_len=0 -minfull_len=0 -targets=0 ## jagged causal -bin/tile_example_hstu_attention -v=1 -prec=fp16 -b=10 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlen=300,300,290,280,310 -causal=1 -local_len=0 -context_len=0 -minfull_len=0 -targets=0 +$EXE -v=1 -prec=fp16 -b=10 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlen=300,300,290,280,310 -causal=1 -local_len=0 -context_len=0 -minfull_len=0 -targets=0 ## batched causal+local -bin/tile_example_hstu_attention -v=1 -prec=fp16 -b=10 -jagged=0 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlen=256 -causal=1 -local_len=5 -context_len=8 -minfull_len=7 -targets=8 +$EXE -v=1 -prec=fp16 -b=10 -jagged=0 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlen=256 -causal=1 -local_len=5 -context_len=8 -minfull_len=7 -targets=8 ## jagged causal+local -bin/tile_example_hstu_attention -v=1 -prec=fp16 -b=10 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlen=300,300,290,280,310 -causal=1 -local_len=5 -context_len=8 -minfull_len=7 -targets=8 +$EXE -v=1 -prec=fp16 -b=10 -jagged=1 -nhead=4 -hdim_qk=128 -hdim_v=128 -seqlen=300,300,290,280,310 -causal=1 -local_len=5 -context_len=8 -minfull_len=7 -targets=8