From 8b37a2968f43ff850ead2a277e680e3388130126 Mon Sep 17 00:00:00 2001 From: jefyang1 <146495389+jefyang1@users.noreply.github.com> Date: Tue, 19 Aug 2025 09:58:28 -0700 Subject: [PATCH] Fix pk i4 v3 example test regression on gfx942 (#2706) Co-authored-by: Illia Silin <98187287+illsilin@users.noreply.github.com> [ROCm/composable_kernel commit: 6ba9289b26b5df0960e0d314f2ade988f88ea35e] --- .../gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp index dc8e98218e..57adcd4f6d 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp @@ -44,10 +44,11 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, MinimumOccupancy) is_gfx950_build = false, #endif }; - // skip building the instances with K1>=32 on pre-gfx950 - if constexpr(((GridwiseGemm::AK1Number >= 32 || GridwiseGemm::BK1Number >= 32) && - static_cast<bool>(Arch::is_gfx950_build)) || - (GridwiseGemm::AK1Number < 32 && GridwiseGemm::BK1Number < 32)) + // skip building the instances with K1>=32 && PackedSize != 2 on pre-gfx950 + if constexpr(static_cast<bool>(Arch::is_gfx950_build) || + (GridwiseGemm::AK1Number < 32 && GridwiseGemm::BK1Number < 32) || + (GridwiseGemm::AK1Number >= 32 && GridwiseGemm::APackedSize == 2) || + (GridwiseGemm::BK1Number >= 32 && GridwiseGemm::BPackedSize == 2)) { __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()]; @@ -86,10 +87,11 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, MinimumOccupancy) is_gfx950_build = false, #endif }; - // skip building the instances with K1>=32 on pre-gfx950 - if constexpr(((GridwiseGemm::AK1Number >= 32 || GridwiseGemm::BK1Number >= 32) && - static_cast<bool>(Arch::is_gfx950_build)) || - (GridwiseGemm::AK1Number < 32 && GridwiseGemm::BK1Number < 
32)) + // skip building the instances with K1>=32 && PackedSize != 2 on pre-gfx950 + if constexpr(static_cast<bool>(Arch::is_gfx950_build) || + (GridwiseGemm::AK1Number < 32 && GridwiseGemm::BK1Number < 32) || + (GridwiseGemm::AK1Number >= 32 && GridwiseGemm::APackedSize == 2) || + (GridwiseGemm::BK1Number >= 32 && GridwiseGemm::BPackedSize == 2)) { // Pass two lds pointer is the key to tell compiler that ds_read/write // operate on different lds chunk at same time without order dependecy