[CK_TILE] FMHA Support hdim_v to as a Multiple of 32 (#2114)

* 160+192

* Add splitkv d160

* cleanup

* fix

* Add change log

* Fix CHANGELOG

* Use static_cast

* Update ignored instance

---------

Co-authored-by: asleepzzz <hanwen.chang@amd.com>

[ROCm/composable_kernel commit: b8212864cf]
This commit is contained in:
Yi DING
2025-06-24 01:33:31 +08:00
committed by GitHub
parent 470608bdcb
commit cba904aeff
7 changed files with 89 additions and 68 deletions

View File

@@ -1,5 +1,5 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
@@ -129,7 +129,10 @@ CK_TILE_DEVICE void shuffle_tile_impl_in_thread(OutTensor& out_tensor, const InT
// set output vectors
static_for<0, num_vec_out, 1>{}([&](auto i) {
constexpr auto idx_y_out_tmp = generate_array(
[&](auto ii) { return ii == y_dim_vec_in ? idx_y_start[ii] + i : idx_y_start[ii]; },
[&](auto ii) {
return ii == y_dim_vec_in ? static_cast<index_t>(idx_y_start[ii]) + i
: static_cast<index_t>(idx_y_start[ii]);
},
number<NDimY>{});
constexpr auto idx_y_out =