From add2107be0b36f7adddd217bb9f859b7305dd259 Mon Sep 17 00:00:00 2001 From: Haocong WANG Date: Wed, 24 Sep 2025 00:59:41 +0800 Subject: [PATCH] [FMHA FWD] gfx950 Accuracy enhancement & bug fix (#2900) * disable cast_tile_pk_fp16_fp32 on gfx950 * fix wrong encoding when hdim is not a power of 2 --------- Co-authored-by: asleepzzz [ROCm/composable_kernel commit: 959df2a15563155329f1d77b2151c3744ff2d749] --- include/ck_tile/core/tensor/tile_elementwise.hpp | 2 +- .../pipeline/block_fmha_pipeline_qx_ks_vs_custom_policy.hpp | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/include/ck_tile/core/tensor/tile_elementwise.hpp b/include/ck_tile/core/tensor/tile_elementwise.hpp index 284efd5d70..d29afa2d98 100644 --- a/include/ck_tile/core/tensor/tile_elementwise.hpp +++ b/include/ck_tile/core/tensor/tile_elementwise.hpp @@ -231,7 +231,7 @@ CK_TILE_DEVICE auto cast_tile_pk_fp8_fp32(const InTensor& in_dstr_tensors) template CK_TILE_DEVICE auto cast_tile_pk_fp16_fp32(const InTensor& in_dstr_tensors) { -#if defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx94__) +#if defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx942__) // This API is designed to use the _pk_ serious of function constexpr auto in_tile_dstr = InTensor::get_tile_distribution(); diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qx_ks_vs_custom_policy.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qx_ks_vs_custom_policy.hpp index ff1f31edc8..dccb41ba44 100644 --- a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qx_ks_vs_custom_policy.hpp +++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qx_ks_vs_custom_policy.hpp @@ -813,7 +813,8 @@ struct BlockFmhaPipelineQXKSVSCustomPolicy : BlockFmhaPipelineQXCustomPolicy, @@ -903,7 +904,7 @@ struct BlockFmhaPipelineQXKSVSCustomPolicy : BlockFmhaPipelineQXCustomPolicy, tuple, sequence>,