[FMHA FWD] gfx950 Accuracy enhancement & bug fix (#2900)

* disable cast_tile_pk_fp16_fp32 on gfx950 * fix wrong encoding when hdim is not a power of 2 --------- Co-authored-by: asleepzzz <hanwen.chang@amd.com> [ROCm/composable_kernel commit: 959df2a155]
2026-07-19 02:01:01 +00:00 · 2025-09-24 00:59:41 +08:00
parent 0eede5af24
commit add2107be0
2 changed files with 4 additions and 3 deletions
--- a/include/ck_tile/core/tensor/tile_elementwise.hpp
+++ b/include/ck_tile/core/tensor/tile_elementwise.hpp
@@ -231,7 +231,7 @@ CK_TILE_DEVICE auto cast_tile_pk_fp8_fp32(const InTensor& in_dstr_tensors)
 template <typename OutDataType, typename InTensor>
 CK_TILE_DEVICE auto cast_tile_pk_fp16_fp32(const InTensor& in_dstr_tensors)
 {
-#if defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx94__)
+#if defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx942__)
    // This API is designed to use the _pk_ serious of function
    constexpr auto in_tile_dstr = InTensor::get_tile_distribution();

--- a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qx_ks_vs_custom_policy.hpp
+++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qx_ks_vs_custom_policy.hpp
@@ -813,7 +813,8 @@ struct BlockFmhaPipelineQXKSVSCustomPolicy : BlockFmhaPipelineQXCustomPolicy<QLo
                constexpr index_t N1_m = kNPack / N2;
                constexpr index_t N0_m = kNPerBlock / kNPack;
                constexpr index_t K1   = get_warp_size() / N1_m;
-                constexpr index_t K2_m = kKPerBlock / K1;
+                constexpr index_t K2_m = kKPerBlock / K1 / K0;
+
                return make_static_tile_distribution(
                    tile_distribution_encoding<
                        sequence<1>,
@@ -903,7 +904,7 @@ struct BlockFmhaPipelineQXKSVSCustomPolicy : BlockFmhaPipelineQXCustomPolicy<QLo
            constexpr index_t N1_m = kNPack / N2;
            constexpr index_t N0_m = kNPerBlock / kNPack;
            constexpr index_t K1   = get_warp_size() / N1_m;
-            constexpr index_t K2_m = kKPerBlock / K1;
+            constexpr index_t K2_m = kKPerBlock / K1 / K0;
            return make_static_tile_distribution(
                tile_distribution_encoding<sequence<1>,
                                           tuple<sequence<N0_m, N1_m, N2>, sequence<K0, K1, K2_m>>,