From b721f79f994d620d7adc25ebcf69f4e0a10dc631 Mon Sep 17 00:00:00 2001 From: Juuso Korhonen <40278371+juuso-oskari@users.noreply.github.com> Date: Mon, 13 Oct 2025 10:30:11 +0000 Subject: [PATCH 1/2] fix --- .../unified_attention/kernel/unified_attention_kernel.hpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/include/ck_tile/ops/unified_attention/kernel/unified_attention_kernel.hpp b/include/ck_tile/ops/unified_attention/kernel/unified_attention_kernel.hpp index ea1c4f3bf0..5765fd858c 100644 --- a/include/ck_tile/ops/unified_attention/kernel/unified_attention_kernel.hpp +++ b/include/ck_tile/ops/unified_attention/kernel/unified_attention_kernel.hpp @@ -354,15 +354,13 @@ struct FmhaFwdV3Kernel q_dram_pad, make_tuple( make_merge_transform( - make_tuple(seq_len, num_queries_per_kv) + make_tuple(seq_len_padded, num_queries_per_kv) ), make_pass_through_transform(HEAD_SIZE_PADDED) ), make_tuple(sequence<0, 1>{}, sequence<2>{}), make_tuple(sequence<0>{}, sequence<1>{}) - ); - - // TODO are we padding the tensor view or the block here? 
+ ); // flattens the first two dims, head dim is the fastest changing dim in the merged dim return q_dram_merged; }(); From 6ba25b7e8492fd5fa88e91d29a11135490af8f3d Mon Sep 17 00:00:00 2001 From: Juuso Korhonen <40278371+juuso-oskari@users.noreply.github.com> Date: Mon, 13 Oct 2025 10:34:55 +0000 Subject: [PATCH 2/2] add commenting --- .../ops/unified_attention/kernel/unified_attention_kernel.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/ck_tile/ops/unified_attention/kernel/unified_attention_kernel.hpp b/include/ck_tile/ops/unified_attention/kernel/unified_attention_kernel.hpp index ca71e36c3b..9955ca24b3 100644 --- a/include/ck_tile/ops/unified_attention/kernel/unified_attention_kernel.hpp +++ b/include/ck_tile/ops/unified_attention/kernel/unified_attention_kernel.hpp @@ -346,7 +346,7 @@ struct FmhaFwdV3Kernel number{}, number<1>{}); - const auto q_dram_pad = pad_tensor_view( // aling cu_seqlen with BLOCK_Q and head dim with HEAD_SIZE_PADDED + const auto q_dram_pad = pad_tensor_view( // align seqlen with BLOCK_Q and head dim with HEAD_SIZE_PADDED q_dram_base, // block sizes make_tuple(BLOCK_Q, 1, HEAD_SIZE_PADDED), @@ -363,7 +363,7 @@ struct FmhaFwdV3Kernel ), make_tuple(sequence<0, 1>{}, sequence<2>{}), make_tuple(sequence<0>{}, sequence<1>{}) - ); // flattens the first two dims, head dim is the fastest changing dim in the merged dim + ); // flattens the first two dims, head idx is the fastest changing dim in the merged dim return q_dram_merged; }();