From 4c98535456c468cbd36d39de4a92406fa3a012b6 Mon Sep 17 00:00:00 2001 From: Illia Silin <98187287+illsilin@users.noreply.github.com> Date: Fri, 3 Oct 2025 07:08:49 -0700 Subject: [PATCH 1/3] fix compilation errors on RHEL8 and SLES15 (#2967) --- .../gpu/device/impl/device_gemm_wmma_cshuffle_v3r1.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma_cshuffle_v3r1.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma_cshuffle_v3r1.hpp index df51a2aa27..4c54ec85c1 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma_cshuffle_v3r1.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma_cshuffle_v3r1.hpp @@ -196,7 +196,7 @@ struct DeviceGemm_Wmma_CShuffleV3R1 : public DeviceGemmV2R1>; + using DLayout = remove_cvref_t>; if constexpr(is_same::value) return Number{}; else @@ -253,7 +253,7 @@ struct DeviceGemm_Wmma_CShuffleV3R1 : public DeviceGemmV2R1{}([&](auto i) { DsLengths[i] = out_lengths; - using DLayout = ::std::__remove_cvref_t>; + using DLayout = remove_cvref_t>; if constexpr(is_same::value) { DsStrides[i] = {arg.StrideDs[i], 1}; From b4a4aa2b64a7a94ab04126545a3dc4f6d3eba847 Mon Sep 17 00:00:00 2001 From: Thomas Ning Date: Fri, 3 Oct 2025 09:46:13 -0700 Subject: [PATCH 2/3] [CK Tile] CShuffle Tile Permute N all warp compatible (#2966) * solve the hard_code issue of kM2 * clang format --- .../ops/epilogue/cshuffle_epilogue.hpp | 34 +++++++++---------- 1 file changed, 16 insertions(+), 18 deletions(-) diff --git a/include/ck_tile/ops/epilogue/cshuffle_epilogue.hpp b/include/ck_tile/ops/epilogue/cshuffle_epilogue.hpp index e0a39a5aea..5918ec806b 100644 --- a/include/ck_tile/ops/epilogue/cshuffle_epilogue.hpp +++ b/include/ck_tile/ops/epilogue/cshuffle_epilogue.hpp @@ -433,8 +433,13 @@ struct CShuffleEpilogue const ScaleM& scale_m = {}, const ScaleN& scale_n = {}) { + static constexpr int RowsPerLane = CWarpTensor::get_thread_buffer_size(); + + static_assert(MPerXdl % RowsPerLane == 0, + "CShuffle (permuteN): MPerXdl must be divisible by per-lane row count."); + constexpr int kM0 = MWave; - constexpr int kM2 = 4; + constexpr int kM2 = RowsPerLane; constexpr int kM1 = MPerXdl / kM2; constexpr int kN0 = NWave; @@ -515,32 +520,25 @@ struct CShuffleEpilogue // Pack 4 “rows per lane” as you already do static_for<0, NRepeat, 1>{}([&](auto n_idx) { // source indices in shuffle_acc: (n_idx * product(Y) + row) - const index_t base = n_idx * c_warp_y_lengths.product(); + const index_t plane = c_warp_y_lengths.product(); // local lambda to fuse scale (if present) and convert - auto emit = [&](index_t out_idx, index_t src_row) { - AccDataType v = shuffle_acc.get_thread_buffer()[base + src_row]; - + static_for<0, kM2, 1>{}([&](auto m_lane) { + const int src = n_idx * plane + m_lane; // source row in this N-plane + const int dst = n_idx + m_lane * NRepeat; // permuted N layout in output + AccDataType v = shuffle_acc.get_thread_buffer()[src]; if constexpr(has_scalar_scales) { v = static_cast(v * scale_m * scale_n); } else if constexpr(has_scales && !has_scalar_scales) { - // same linear index mapping on the permuted distribution - const auto s_m = static_cast(sm_tile.get_thread_buffer()[out_idx]); - const auto s_n = static_cast(sn_tile.get_thread_buffer()[out_idx]); - v = static_cast(v * s_m * s_n); + const auto sm = static_cast(sm_tile.get_thread_buffer()[dst]); + const auto sn = static_cast(sn_tile.get_thread_buffer()[dst]); + v = static_cast(v * sm * sn); } - - c_out_tensor.get_thread_buffer()[out_idx] = type_convert(v); - }; - - // Your current packing pattern (rows 0..3, spaced by NRepeat) - emit(n_idx + 0 * NRepeat, 0); - emit(n_idx + 1 * NRepeat, 1); - emit(n_idx + 2 * NRepeat, 2); - emit(n_idx + 3 * NRepeat, 3); + c_out_tensor.get_thread_buffer()[dst] = type_convert(v); + }); }); // store/update From 58983a323287d41dff8b37c5318942d7159559dc Mon Sep 17 00:00:00 2001 From: Geo Min Date: Fri, 3 Oct 2025 12:50:16 -0700 Subject: [PATCH 3/3] [TheRock CI] Bumping hash for TheRock (#2972) * Adding new hash for TheRock * Removing package --- .github/workflows/therock-ci-linux.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/therock-ci-linux.yml b/.github/workflows/therock-ci-linux.yml index 695fb1d913..25b345880b 100644 --- a/.github/workflows/therock-ci-linux.yml +++ b/.github/workflows/therock-ci-linux.yml @@ -41,7 +41,7 @@ jobs: uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: repository: "ROCm/TheRock" - ref: 409f43ad9d564454bb1b23f8c8aa15d6b9d25200 + ref: 3f62012a748df3a3099c51fa95d104db643a4588 # 10-03-2025 commit path: "TheRock" - name: Runner Health Settings @@ -54,6 +54,7 @@ jobs: - name: Patch rocm-libraries run: | + rm ./TheRock/patches/amd-mainline/rocm-libraries/0009-Use-workgroupMappingDim-in-rocroller_host.patch git config --global --add safe.directory '*' git -c user.name="therockbot" -c "user.email=therockbot@amd.com" am --whitespace=nowarn ./TheRock/patches/amd-mainline/rocm-libraries/*.patch