diff --git a/.github/workflows/therock-ci-linux.yml b/.github/workflows/therock-ci-linux.yml index 695fb1d913..25b345880b 100644 --- a/.github/workflows/therock-ci-linux.yml +++ b/.github/workflows/therock-ci-linux.yml @@ -41,7 +41,7 @@ jobs: uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: repository: "ROCm/TheRock" - ref: 409f43ad9d564454bb1b23f8c8aa15d6b9d25200 + ref: 3f62012a748df3a3099c51fa95d104db643a4588 # 10-03-2025 commit path: "TheRock" - name: Runner Health Settings @@ -54,6 +54,7 @@ jobs: - name: Patch rocm-libraries run: | + rm ./TheRock/patches/amd-mainline/rocm-libraries/0009-Use-workgroupMappingDim-in-rocroller_host.patch git config --global --add safe.directory '*' git -c user.name="therockbot" -c "user.email=therockbot@amd.com" am --whitespace=nowarn ./TheRock/patches/amd-mainline/rocm-libraries/*.patch diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma_cshuffle_v3r1.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma_cshuffle_v3r1.hpp index df51a2aa27..4c54ec85c1 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma_cshuffle_v3r1.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma_cshuffle_v3r1.hpp @@ -196,7 +196,7 @@ struct DeviceGemm_Wmma_CShuffleV3R1 : public DeviceGemmV2R1>; + using DLayout = remove_cvref_t>; if constexpr(is_same::value) return Number{}; else @@ -253,7 +253,7 @@ struct DeviceGemm_Wmma_CShuffleV3R1 : public DeviceGemmV2R1{}([&](auto i) { DsLengths[i] = out_lengths; - using DLayout = ::std::__remove_cvref_t>; + using DLayout = remove_cvref_t>; if constexpr(is_same::value) { DsStrides[i] = {arg.StrideDs[i], 1}; diff --git a/include/ck_tile/ops/epilogue/cshuffle_epilogue.hpp b/include/ck_tile/ops/epilogue/cshuffle_epilogue.hpp index e0a39a5aea..5918ec806b 100644 --- a/include/ck_tile/ops/epilogue/cshuffle_epilogue.hpp +++ b/include/ck_tile/ops/epilogue/cshuffle_epilogue.hpp @@ -433,8 +433,13 @@ struct CShuffleEpilogue const ScaleM& scale_m = {}, const ScaleN& scale_n = {}) { + static constexpr int RowsPerLane = CWarpTensor::get_thread_buffer_size(); + + static_assert(MPerXdl % RowsPerLane == 0, + "CShuffle (permuteN): MPerXdl must be divisible by per-lane row count."); + constexpr int kM0 = MWave; - constexpr int kM2 = 4; + constexpr int kM2 = RowsPerLane; constexpr int kM1 = MPerXdl / kM2; constexpr int kN0 = NWave; @@ -515,32 +520,25 @@ struct CShuffleEpilogue // Pack 4 “rows per lane” as you already do static_for<0, NRepeat, 1>{}([&](auto n_idx) { // source indices in shuffle_acc: (n_idx * product(Y) + row) - const index_t base = n_idx * c_warp_y_lengths.product(); + const index_t plane = c_warp_y_lengths.product(); // local lambda to fuse scale (if present) and convert - auto emit = [&](index_t out_idx, index_t src_row) { - AccDataType v = shuffle_acc.get_thread_buffer()[base + src_row]; - + static_for<0, kM2, 1>{}([&](auto m_lane) { + const int src = n_idx * plane + m_lane; // source row in this N-plane + const int dst = n_idx + m_lane * NRepeat; // permuted N layout in output + AccDataType v = shuffle_acc.get_thread_buffer()[src]; if constexpr(has_scalar_scales) { v = static_cast(v * scale_m * scale_n); } else if constexpr(has_scales && !has_scalar_scales) { - // same linear index mapping on the permuted distribution - const auto s_m = static_cast(sm_tile.get_thread_buffer()[out_idx]); - const auto s_n = static_cast(sn_tile.get_thread_buffer()[out_idx]); - v = static_cast(v * s_m * s_n); + const auto sm = static_cast(sm_tile.get_thread_buffer()[dst]); + const auto sn = static_cast(sn_tile.get_thread_buffer()[dst]); + v = static_cast(v * sm * sn); } - - c_out_tensor.get_thread_buffer()[out_idx] = type_convert(v); - }; - - // Your current packing pattern (rows 0..3, spaced by NRepeat) - emit(n_idx + 0 * NRepeat, 0); - emit(n_idx + 1 * NRepeat, 1); - emit(n_idx + 2 * NRepeat, 2); - emit(n_idx + 3 * NRepeat, 3); + c_out_tensor.get_thread_buffer()[dst] = type_convert(v); + }); }); // store/update