From 09019c10245dcfc384d49fec06357c86d605a48f Mon Sep 17 00:00:00 2001 From: "assistant-librarian[bot]" Date: Fri, 19 Dec 2025 23:13:41 +0000 Subject: [PATCH] Merge commit 'cbc83359649b1b56cd745c4102e9556112f942c2' into develop --- .github/workflows/therock-ci-linux.yml | 2 +- .github/workflows/therock-ci.yml | 29 +---- .github/workflows/therock-test-component.yml | 2 +- .github/workflows/therock-test-packages.yml | 2 +- CMakeLists.txt | 16 +++ ...est_ckb_conv_fwd_2d_bf16_scaleadd_relu.cpp | 2 +- .../conv/ck/test_ckb_conv_fwd_3d_fp16.cpp | 2 +- .../builder/test/test_conv_description.cpp | 6 +- .../test/utils/ckb_conv_test_configs.hpp | 2 +- .../gpu/device/device_base.hpp | 56 ++++++--- .../device_batched_gemm_gemm_xdl_cshuffle.hpp | 4 +- ...ultiple_d_gemm_multiple_d_xdl_cshuffle.hpp | 28 ++--- ...gemm_softmax_gemm_permute_xdl_cshuffle.hpp | 4 +- ...batched_gemm_softmax_gemm_xdl_cshuffle.hpp | 4 +- ...ped_conv_fwd_multiple_abd_xdl_cshuffle.hpp | 114 ++++++++++-------- ..._conv_fwd_multiple_abd_xdl_cshuffle_v3.hpp | 60 ++++++--- ...d_multiple_d_xdl_large_tensor_cshuffle.hpp | 73 +++++++---- ...gemm_softmax_gemm_permute_xdl_cshuffle.hpp | 4 +- include/ck_tile/core/config.hpp | 2 +- 19 files changed, 249 insertions(+), 163 deletions(-) diff --git a/.github/workflows/therock-ci-linux.yml b/.github/workflows/therock-ci-linux.yml index b8977f702f..0baa503334 100644 --- a/.github/workflows/therock-ci-linux.yml +++ b/.github/workflows/therock-ci-linux.yml @@ -54,7 +54,7 @@ jobs: with: repository: "ROCm/TheRock" path: "TheRock" - ref: bfcaf6e0bcd4bfe3c21990f49bbccb7d2a087d5d # 2025-12-15 commit + ref: d76278526218def9fb1b016bc9e421738cb4f8f6 # 2025-12-09 commit - name: Setup ccache run: | diff --git a/.github/workflows/therock-ci.yml b/.github/workflows/therock-ci.yml index 2055e6643a..0951244f31 100644 --- a/.github/workflows/therock-ci.yml +++ b/.github/workflows/therock-ci.yml @@ -35,44 +35,21 @@ jobs: BASE_REF: HEAD^ outputs: enable_therock_ci: ${{ steps.configure.outputs.enable_therock_ci }} - linux_package_targets: ${{ steps.configure_linux.outputs.package_targets }} steps: - name: "Checking out repository" uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 with: # We need the parent commit to do a diff fetch-depth: 2 - - - name: Checkout TheRock repository - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 - with: - repository: "ROCm/TheRock" - path: TheRock - ref: bfcaf6e0bcd4bfe3c21990f49bbccb7d2a087d5d # 2025-12-15 commit - name: "Configuring CI options" id: configure run: python .github/scripts/therock_configure_ci.py - - name: Fetch Linux targets for build and test - env: - THEROCK_PACKAGE_PLATFORM: "linux" - # TODO(geomin12): Allow dynamic values of AMDGPU_FAMILIES, with opt-in options - AMDGPU_FAMILIES: "gfx94X" - # Variable comes from ROCm organization variable 'ROCM_THEROCK_TEST_RUNNERS' - ROCM_THEROCK_TEST_RUNNERS: ${{ vars.ROCM_THEROCK_TEST_RUNNERS }} - LOAD_TEST_RUNNERS_FROM_VAR: true - id: configure_linux - run: python ./TheRock/build_tools/github_actions/fetch_package_targets.py - therock-ci-linux: - name: TheRock CI Linux (${{ matrix.target_bundle.amdgpu_family }}) + name: TheRock CI Linux needs: setup if: ${{ needs.setup.outputs.enable_therock_ci == 'true' }} - strategy: - fail-fast: false - matrix: - target_bundle: ${{ fromJSON(needs.setup.outputs.linux_package_targets) }} permissions: contents: read id-token: write @@ -87,8 +64,8 @@ jobs: -DTHEROCK_COMPOSABLE_KERNEL_SOURCE_DIR=../composable_kernel -DTHEROCK_USE_EXTERNAL_ROCM_LIBRARIES=ON -DTHEROCK_ROCM_LIBRARIES_SOURCE_DIR=../ - amdgpu_families: ${{ matrix.target_bundle.amdgpu_family }} - test_runs_on: ${{ matrix.target_bundle.test_machine }} + amdgpu_families: "gfx94X-dcgpu" + test_runs_on: "linux-mi325-1gpu-ossci-rocm-frac" therock_ci_summary: name: TheRock CI Summary diff --git a/.github/workflows/therock-test-component.yml b/.github/workflows/therock-test-component.yml index 4debcf9813..565d1d3e54 100644 --- a/.github/workflows/therock-test-component.yml +++ b/.github/workflows/therock-test-component.yml @@ -51,7 +51,7 @@ jobs: uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 with: repository: "ROCm/TheRock" - ref: bfcaf6e0bcd4bfe3c21990f49bbccb7d2a087d5d # 2025-12-15 commit + ref: d76278526218def9fb1b016bc9e421738cb4f8f6 # 2025-12-09 commit - name: Run setup test environment workflow uses: './.github/actions/setup_test_environment' diff --git a/.github/workflows/therock-test-packages.yml b/.github/workflows/therock-test-packages.yml index 6389a170b7..cd255a40b6 100644 --- a/.github/workflows/therock-test-packages.yml +++ b/.github/workflows/therock-test-packages.yml @@ -27,7 +27,7 @@ jobs: uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: repository: "ROCm/TheRock" - ref: bfcaf6e0bcd4bfe3c21990f49bbccb7d2a087d5d # 2025-12-15 commit + ref: d76278526218def9fb1b016bc9e421738cb4f8f6 # 2025-12-09 commit - name: "Configuring CI options" env: diff --git a/CMakeLists.txt b/CMakeLists.txt index eaed7d3509..06d270c16e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -38,6 +38,22 @@ set(version 1.2.0) project(composable_kernel VERSION ${version} LANGUAGES CXX HIP) include(CTest) +# Set the default value of LLVM_MAIN_REVISION to the equivalent of ROCm7.1.1 release +set(LLVM_MAIN_REVISION 524190) +if (CMAKE_CXX_COMPILER MATCHES "/opt/rocm") + message("compiler in default /opt/rocm/ path") + file(READ "/opt/rocm/llvm/include/llvm/Config/llvm-config.h" HEADER_CONTENT) + string(REGEX MATCH "#define LLVM_MAIN_REVISION[ \t]+([0-9]+)" MATCH_RESULT "${HEADER_CONTENT}") + set(LLVM_MAIN_REVISION ${CMAKE_MATCH_1}) +elseif (CMAKE_CXX_COMPILER MATCHES "/llvm-project/build") + message("compiler in custom /llvm-project/build/ path") + file(READ "/llvm-project/build/include/llvm/Config/llvm-config.h" HEADER_CONTENT) + string(REGEX MATCH "#define LLVM_MAIN_REVISION[ \t]+([0-9]+)" MATCH_RESULT "${HEADER_CONTENT}") + set(LLVM_MAIN_REVISION ${CMAKE_MATCH_1}) +endif() +message("From CMake: LLVM_MAIN_REVISION=${LLVM_MAIN_REVISION}") +add_definitions(-DLLVM_MAIN_REVISION=${LLVM_MAIN_REVISION}) + option(ENABLE_CLANG_CPP_CHECKS "Enables clang tidy, cppcheck" ON) option(MIOPEN_REQ_LIBS_ONLY "Build only the MIOpen required libraries" OFF) option(CK_EXPERIMENTAL_BUILDER "Enable experimental builder" OFF) diff --git a/experimental/builder/test/conv/ck/test_ckb_conv_fwd_2d_bf16_scaleadd_relu.cpp b/experimental/builder/test/conv/ck/test_ckb_conv_fwd_2d_bf16_scaleadd_relu.cpp index e3dc261fe3..0d9563e05a 100644 --- a/experimental/builder/test/conv/ck/test_ckb_conv_fwd_2d_bf16_scaleadd_relu.cpp +++ b/experimental/builder/test/conv/ck/test_ckb_conv_fwd_2d_bf16_scaleadd_relu.cpp @@ -33,7 +33,7 @@ TEST(FwdConvInstances, constexpr auto FwdConvAlgorithm = ConvAlgorithm_DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle{} .with_thread_block(FwdThreadBlock_64_64x32x32) - .with_gemm_config(FwdGemmParams_Xdl_2x2_per_wave) + .with_gemm_config(FwdGemmParams_Xdl_2x1_per_wave) .with_transfer(FwdTransfer_4x16x1) .with_specializations(ConvFwdSpecialization::DEFAULT, GemmSpecialization::MNKPadding) .with_prefetch_config(1, 1, PipelineScheduler::DEFAULT); diff --git a/experimental/builder/test/conv/ck/test_ckb_conv_fwd_3d_fp16.cpp b/experimental/builder/test/conv/ck/test_ckb_conv_fwd_3d_fp16.cpp index 11c8172533..b30f958bc4 100644 --- a/experimental/builder/test/conv/ck/test_ckb_conv_fwd_3d_fp16.cpp +++ b/experimental/builder/test/conv/ck/test_ckb_conv_fwd_3d_fp16.cpp @@ -28,7 +28,7 @@ TEST(FwdConvInstances, constexpr auto FwdConvAlgorithm = ConvAlgorithm_DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3{} .with_thread_block(FwdThreadBlock_256_128x128x32) - .with_gemm_config(FwdGemmParams_Xdl_4x4_per_wave) + .with_gemm_config(FwdGemmParams_Xdl_2x1_per_wave) .with_transfer(FwdTransfer_4x64x1) .with_specializations(ConvFwdSpecialization::FILTER_1X1_PAD0, GemmSpecialization::MNKPadding) diff --git a/experimental/builder/test/test_conv_description.cpp b/experimental/builder/test/test_conv_description.cpp index 158cb2668f..dca0e858eb 100644 --- a/experimental/builder/test/test_conv_description.cpp +++ b/experimental/builder/test/test_conv_description.cpp @@ -111,8 +111,8 @@ struct DefaultAlgorithm .bk1 = 8, .m_per_xdl = 16, .n_per_xdl = 16, - .m_xdl_per_wave = 4, - .n_xdl_per_wave = 4}; + .m_xdl_per_wave = 8, + .n_xdl_per_wave = 8}; ckb::test::TransferABC transfer{ .a = @@ -188,7 +188,7 @@ TEST(ConvDescriptionTest, DefaultInstanceHasDetailedDescription) " ├─ Pipeline scheduler: INTRAWAVE\n" " ├─ Warp Gemm parameters: \n" " │ ├─ subtile size: 16×16\n" - " │ └─ Number of warp gemm iterations: 4×4\n" + " │ └─ Number of warp gemm iterations: 8×8\n" " └─ Memory access:\n" " ├─ A Tile transfer: \n" " │ ├─ Tile dimensions: 4×256×8×\n" diff --git a/experimental/builder/test/utils/ckb_conv_test_configs.hpp b/experimental/builder/test/utils/ckb_conv_test_configs.hpp index 403c2ffd79..ad5a5f4f6f 100644 --- a/experimental/builder/test/utils/ckb_conv_test_configs.hpp +++ b/experimental/builder/test/utils/ckb_conv_test_configs.hpp @@ -68,7 +68,7 @@ constexpr TransferABC FwdTransfer_4x64x1{ {.m_block = 1, .m_wave_per_xdl = 32, .n_block = 1, .n_wave_per_xdl = 8}, .epilogue = {.m_xdl_per_wave_per_shuffle = 1, .n_per_wave_per_shuffle = 1, - .scalar_per_vector = 8}, + .scalar_per_vector = 4}, }, }; diff --git a/include/ck/tensor_operation/gpu/device/device_base.hpp b/include/ck/tensor_operation/gpu/device/device_base.hpp index 9179a279c5..361b116782 100644 --- a/include/ck/tensor_operation/gpu/device/device_base.hpp +++ b/include/ck/tensor_operation/gpu/device/device_base.hpp @@ -60,7 +60,7 @@ template -static constexpr auto GetNXdlPerWave2() +static constexpr auto GetXdlPerWave2() { constexpr index_t Waves = IsWave64 ? BlockSize_ / 64 : BlockSize_ / 32; constexpr index_t MWaves = MPerBlock_ / (MXdlPerWave_ * MPerXDL_); @@ -84,17 +84,33 @@ static constexpr auto GetNXdlPerWave2() } } -#define GET_NXDL_PER_WAVE_IMPL \ - template \ - static constexpr auto GetNXdlPerWave() \ - { \ - return GetNXdlPerWave2(); \ +#define GET_NXDL_PER_WAVE_IMPL \ + template \ + static constexpr auto GetNXdlPerWave() \ + { \ + return GetXdlPerWave2(); \ + } + +#define GET_MXDL_PER_WAVE_IMPL \ + template \ + static constexpr auto GetMXdlPerWave() \ + { \ + return GetXdlPerWave2(); \ } template () - : GetNXdlPerWave2(); + ? GetXdlPerWave2() + : GetXdlPerWave2(); if constexpr(IsWave64 == false && NXdlPerWave != 0) { diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_gemm_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_gemm_xdl_cshuffle.hpp index 6089e7e63f..b930c50e3a 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_gemm_xdl_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_gemm_xdl_cshuffle.hpp @@ -190,9 +190,9 @@ struct DeviceBatchedGemmGemm_Xdl_CShuffle : public DeviceBatchedGemmGemm(); + GetXdlPerWave2(); static constexpr auto MXdlPerWave32 = - GetNXdlPerWave2(); + GetXdlPerWave2(); static constexpr auto I0 = Number<0>{}; static constexpr auto I1 = Number<1>{}; static constexpr auto I2 = Number<2>{}; diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle.hpp index 1fc7c8e523..4410871ac1 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle.hpp @@ -235,20 +235,20 @@ struct DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle { using DeviceOp = DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle; - static constexpr auto Gemm0MXdlPerWave64 = GetNXdlPerWave2(); - static constexpr auto Gemm0MXdlPerWave32 = GetNXdlPerWave2(); + static constexpr auto Gemm0MXdlPerWave64 = GetXdlPerWave2(); + static constexpr auto Gemm0MXdlPerWave32 = GetXdlPerWave2(); static constexpr index_t NumD0Tensor = D0sDataType::Size(); static constexpr index_t NumD1Tensor = D1sDataType::Size(); diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp index 9bacb3b661..9ece23985a 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp @@ -223,9 +223,9 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle MaskingSpec> { static constexpr auto MXdlPerWave64 = - GetNXdlPerWave2(); + GetXdlPerWave2(); static constexpr auto MXdlPerWave32 = - GetNXdlPerWave2(); + GetXdlPerWave2(); static_assert(NumDimG > 0 && NumDimM > 0 && NumDimN > 0 && NumDimK > 0 && NumDimO > 0, "Number of dimension must be greater than 0"); diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_xdl_cshuffle.hpp index d6a4f49be8..35b2f54f58 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_xdl_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_xdl_cshuffle.hpp @@ -211,9 +211,9 @@ struct DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle using DeviceOp = DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle; static constexpr auto MXdlPerWave64 = - GetNXdlPerWave2(); + GetXdlPerWave2(); static constexpr auto MXdlPerWave32 = - GetNXdlPerWave2(); + GetXdlPerWave2(); static constexpr auto I0 = Number<0>{}; static constexpr auto I1 = Number<1>{}; diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp index 5ed8da8d1b..6229362a7a 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp @@ -325,9 +325,15 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle BComputeDataType> { using DeviceOp = DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle; - GET_NXDL_PER_WAVE_IMPL - static constexpr auto NXdlPerWave64 = GetNXdlPerWave(); - static constexpr auto NXdlPerWave32 = GetNXdlPerWave(); + GET_MXDL_PER_WAVE_IMPL + // Force usage of 16x16 instruction for WMMA + static constexpr index_t Wave32MaxMNPerXDL = 16; + static constexpr auto MXdlPerWave64 = GetMXdlPerWave(); + static constexpr auto MXdlPerWave32 = + GetMXdlPerWave(); static_assert(NumGroupsToMerge >= 1); @@ -486,35 +492,36 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle using GemmADataType = std::conditional_t, ADataType>; using GemmBDataType = std::conditional_t, BDataType>; -#define CK_GRIDWISE_GEMM_FWD_MULTIPLE_ABD_XDL_CSHUFFLE_TEMPLATE_PARAMETERS \ - GemmADataType, GemmBDataType, AComputeDataType, AccDataType, CShuffleDataType, DsDataType, \ - EDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, \ - InMemoryDataOperationEnum::Set, NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, \ - KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave_, \ - ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, \ - ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, \ - ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, false, \ - ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, \ - BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, \ - BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, \ - BBlockTransferDstScalarPerVector_BK1, false, BBlockLdsExtraN, \ - CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, \ - CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, \ - CDEBlockTransferScalarPerVector_NPerBlock, LoopSched, PipelineVersion::v1, \ +#define CK_GRIDWISE_GEMM_FWD_MULTIPLE_ABD_XDL_CSHUFFLE_TEMPLATE_PARAMETERS \ + GemmADataType, GemmBDataType, AComputeDataType, AccDataType, CShuffleDataType, DsDataType, \ + EDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, \ + InMemoryDataOperationEnum::Set, NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, \ + KPerBlock, AK1, BK1, MPerXDL_, NPerXDL_, MXdlPerWave_, NXdlPerWave*(NPerXDL / NPerXDL_), \ + ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, \ + ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, \ + ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, false, \ + ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, \ + BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, \ + BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, \ + BBlockTransferDstScalarPerVector_BK1, false, BBlockLdsExtraN, \ + CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle*(NPerXDL / NPerXDL_), \ + CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, \ + CDEBlockTransferScalarPerVector_NPerBlock, LoopSched, PipelineVersion::v1, \ BComputeDataType #define CK_GRIDWISE_GEMM_FWD_MULTIPLE_D_XDL_CSHUFFLE_TEMPLATE_PARAMETERS \ GemmADataType, GemmBDataType, AComputeDataType, AccDataType, CShuffleDataType, DsDataType, \ EDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, \ - NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, \ - NPerXDL, MXdlPerWave, NXdlPerWave_, ABlockTransferThreadClusterLengths_AK0_M_AK1, \ - ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, \ - ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, \ - ABlockTransferDstScalarPerVector_AK1, false, ABlockLdsExtraM, \ - BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, \ - BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, \ - BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, false, \ - BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, \ + NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL_, \ + NPerXDL_, MXdlPerWave_, NXdlPerWave*(NPerXDL / NPerXDL_), \ + ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, \ + ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, \ + ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, false, \ + ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, \ + BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, \ + BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, \ + BBlockTransferDstScalarPerVector_BK1, false, BBlockLdsExtraN, \ + CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle*(NPerXDL / NPerXDL_), \ CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, \ CDEBlockTransferScalarPerVector_NPerBlock, LoopSched, PipelineVersion::v1, \ BComputeDataType, DoElementwiseBeforeCShuffle @@ -523,7 +530,7 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle GemmBDataType, GemmADataType, AComputeDataType, AccDataType, CShuffleDataType, DsDataType, \ EDataType, BElementwiseOperation, AElementwiseOperation, CDEElementwiseOperation, \ NumGemmKPrefetchStage, BlockSize, NPerBlock, MPerBlock, KPerBlock, BK1, AK1, NPerXDL, \ - MPerXDL, NXdlPerWave_, MXdlPerWave, BBlockTransferThreadClusterLengths_BK0_N_BK1, \ + MPerXDL, NXdlPerWave, MXdlPerWave_, BBlockTransferThreadClusterLengths_BK0_N_BK1, \ BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, \ BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, \ BBlockTransferDstScalarPerVector_BK1, false, BBlockLdsExtraN, \ @@ -536,34 +543,35 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle BComputeDataType, DoElementwiseBeforeCShuffle // Use appropriate gridwise gemm - template + template using GridwiseGemmMultipleABDBase = GridwiseGemmMultipleABD_xdl_cshuffle< CK_GRIDWISE_GEMM_FWD_MULTIPLE_ABD_XDL_CSHUFFLE_TEMPLATE_PARAMETERS>; - template + template using GridwiseGemmMultipleDBase = GridwiseGemmMultipleD_xdl_cshuffle< CK_GRIDWISE_GEMM_FWD_MULTIPLE_D_XDL_CSHUFFLE_TEMPLATE_PARAMETERS>; - template + template using GridwiseGemmMultipleDCTransposeBase = GridwiseGemmMultipleD_xdl_cshuffle< CK_GRIDWISE_GEMM_FWD_CTRANSPOSE_XDL_CSHUFFLE_TEMPLATE_PARAMETERS>; #undef CK_GRIDWISE_GEMM_FWD_MULTIPLE_ABD_XDL_CSHUFFLE_TEMPLATE_PARAMETERS #undef CK_GRIDWISE_GEMM_FWD_MULTIPLE_D_XDL_CSHUFFLE_TEMPLATE_PARAMETERS #undef CK_GRIDWISE_GEMM_FWD_CTRANSPOSE_XDL_CSHUFFLE_TEMPLATE_PARAMETERS - using GridwiseGemm64 = - std::conditional_t, - GridwiseGemmMultipleDBase>; - using GridwiseGemm32 = std::conditional_t, - GridwiseGemmMultipleDBase>; + using GridwiseGemm64 = std::conditional_t< + isMultiA || isMultiB, + GridwiseGemmMultipleABDBase, + GridwiseGemmMultipleDBase>; + using GridwiseGemm32 = std::conditional_t< + isMultiA || isMultiB, + GridwiseGemmMultipleABDBase, + GridwiseGemmMultipleDBase>; using GridwiseGemmCTranspose64 = std::conditional_t, + GridwiseGemmMultipleDCTransposeBase, GridwiseGemm64>; using GridwiseGemmCTranspose32 = std::conditional_t, + GridwiseGemmMultipleDCTransposeBase, GridwiseGemm32>; // If ADataTypes or BDataTypes is tuple, user has to pass std::array with pointers. @@ -913,14 +921,14 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle if(get_warp_size() == 64) { - if constexpr(NXdlPerWave64 > 0) + if constexpr(MXdlPerWave64 > 0) { InitGridDesc(); } } else { - if constexpr(NXdlPerWave32 > 0) + if constexpr(MXdlPerWave32 > 0) { InitGridDesc(); } @@ -1388,7 +1396,7 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle { if(get_warp_size() == 64) { - if constexpr(NXdlPerWave64 > 0) + if constexpr(MXdlPerWave64 > 0) { return RunImp(arg, stream_config); } @@ -1399,7 +1407,7 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle } else { - if constexpr(NXdlPerWave32 > 0) + if constexpr(MXdlPerWave32 > 0) { return RunImp(arg, stream_config); } @@ -1436,7 +1444,10 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle } } - if(!ck::is_xdl_wmma_supported()) + if(!ck::is_xdl_wmma_supported()) { return false; } @@ -1720,7 +1731,7 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle // check Gridwise GEMM if(get_warp_size() == 64) { - if constexpr(NXdlPerWave64 > 0) + if constexpr(MXdlPerWave64 > 0) { if constexpr(isMultiA || isMultiB) { @@ -1759,7 +1770,7 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle else { - if constexpr(NXdlPerWave32 > 0) + if constexpr(MXdlPerWave32 > 0) { if constexpr(isMultiA || isMultiB) { @@ -2047,8 +2058,13 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle auto str = std::stringstream(); // clang-format off - str << "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle" - << "<" + str << "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle"; + + if(get_warp_size() != 64) { + str << "_WmmaPorted"; + } + + str << "<" << BlockSize << ", " << MPerBlock << ", " << NPerBlock << ", " diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle_v3.hpp index e69a9caa9c..0a4ca23582 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle_v3.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle_v3.hpp @@ -400,9 +400,15 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3 BComputeDataType> { using DeviceOp = DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3; - GET_NXDL_PER_WAVE_IMPL - static constexpr auto NXdlPerWave64 = GetNXdlPerWave(); - static constexpr auto NXdlPerWave32 = GetNXdlPerWave(); + GET_MXDL_PER_WAVE_IMPL + // Force usage of 16x16 instruction for WMMA + static constexpr index_t Wave32MaxMNPerXDL = 16; + static constexpr auto MXdlPerWave64 = GetMXdlPerWave(); + static constexpr auto MXdlPerWave32 = + GetMXdlPerWave(); static constexpr bool isMultiA = is_detected::value; static constexpr bool isMultiB = is_detected::value; @@ -563,7 +569,7 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3 : BBlockTransferSrcScalarPerVector; // Use appropriate gridwise gemm - template + template using GridwiseGemmBase = GridwiseGemmMultiD_xdl_cshuffle_v3< tensor_layout::gemm::RowMajor, tensor_layout::gemm::ColumnMajor, @@ -585,10 +591,10 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3 KPerBlock, AK1, BK1, - MPerXDL, - NPerXDL, - MXdlPerWave, - NXdlPerWave_, + MPerXDL_, + NPerXDL_, + MXdlPerWave_, + NXdlPerWave*(NPerXDL / NPerXDL_), ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, @@ -606,7 +612,7 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3 false, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, - CShuffleNXdlPerWavePerShuffle, + CShuffleNXdlPerWavePerShuffle*(NPerXDL / NPerXDL_), CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEBlockTransferScalarPerVectors, BlkGemmPipeSched, @@ -617,8 +623,8 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3 BDataType, DoElementwiseBeforeCShuffle, DirectLoad>; - using GridwiseGemm64 = GridwiseGemmBase; - using GridwiseGemm32 = GridwiseGemmBase; + using GridwiseGemm64 = GridwiseGemmBase; + using GridwiseGemm32 = GridwiseGemmBase; // #undef GridwiseGemmV3TemplateParams @@ -1430,7 +1436,24 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3 return avg_time; } - INVOKER_RUN_IMPL + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) + { + if(get_warp_size() == 64) + { + if constexpr(MXdlPerWave64 > 0) + { + return RunImp(arg, stream_config); + } + } + else + { + if constexpr(MXdlPerWave32 > 0) + { + return RunImp(arg, stream_config); + } + } + return 0; + } float Run(const BaseArgument* p_arg, const StreamConfig& stream_config = StreamConfig{}) override @@ -1483,7 +1506,10 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3 } } - if(!ck::is_xdl_wmma_supported()) + if(!ck::is_xdl_wmma_supported()) { if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING))) { @@ -1758,7 +1784,7 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3 if(get_warp_size() == 64) { - if constexpr(NXdlPerWave64 > 0) + if constexpr(MXdlPerWave64 > 0) { typename GridwiseGemm64::Argument gemm_arg{nullptr, nullptr, @@ -1780,7 +1806,7 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3 } else { - if constexpr(NXdlPerWave32 > 0) + if constexpr(MXdlPerWave32 > 0) { typename GridwiseGemm32::Argument gemm_arg{nullptr, nullptr, @@ -2064,6 +2090,10 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3 // clang-format off str << "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3"; + if(get_warp_size() != 64) { + str << "_WmmaPorted"; + } + if constexpr(DirectLoad) { str << "_DirectLoad"; } diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_large_tensor_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_large_tensor_cshuffle.hpp index 7c121f1482..ac0b4b663d 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_large_tensor_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_large_tensor_cshuffle.hpp @@ -206,9 +206,15 @@ struct DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor BComputeDataType> { using DeviceOp = DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor; - GET_NXDL_PER_WAVE_IMPL - static constexpr auto NXdlPerWave64 = GetNXdlPerWave(); - static constexpr auto NXdlPerWave32 = GetNXdlPerWave(); + GET_MXDL_PER_WAVE_IMPL + // Force usage of 16x16 instruction for WMMA + static constexpr index_t Wave32MaxMNPerXDL = 16; + static constexpr auto MXdlPerWave64 = GetMXdlPerWave(); + static constexpr auto MXdlPerWave32 = + GetMXdlPerWave(); static constexpr index_t NumDTensor = DsDataType::Size(); static constexpr index_t MaxGemmsNum = 32; @@ -409,25 +415,26 @@ struct DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor #define CK_GRIDWISE_GEMM_FWD_MULTIPLE_D_LARGE_TENSOR_TEMPLATE_PARAMETERS \ ADataType, BDataType, AComputeDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, \ AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, \ - NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, \ - NPerXDL, MXdlPerWave, NXdlPerWave, ABlockTransferThreadClusterLengths_AK0_M_AK1, \ - ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, \ - ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, \ - ABlockTransferDstScalarPerVector_AK1, false, ABlockLdsExtraM, \ - BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, \ - BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, \ - BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, false, \ - BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, \ + NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL_, \ + NPerXDL_, MXdlPerWave_, NXdlPerWave*(NPerXDL / NPerXDL_), \ + ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, \ + ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, \ + ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, false, \ + ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, \ + BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, \ + BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, \ + BBlockTransferDstScalarPerVector_BK1, false, BBlockLdsExtraN, \ + CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle*(NPerXDL / NPerXDL_), \ CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, \ CDEBlockTransferScalarPerVector_NPerBlock, LoopSched, PipelineVersion::v1, \ AComputeDataType, DoElementwiseBeforeCShuffle // Use appropriate gridwise gemm - template + template using GridwiseGemmBase = GridwiseGemmMultipleD_xdl_cshuffle< CK_GRIDWISE_GEMM_FWD_MULTIPLE_D_LARGE_TENSOR_TEMPLATE_PARAMETERS>; #undef CK_GRIDWISE_GEMM_FWD_MULTIPLE_D_LARGE_TENSOR_TEMPLATE_PARAMETERS - using GridwiseGemm64 = GridwiseGemmBase; - using GridwiseGemm32 = GridwiseGemmBase; + using GridwiseGemm64 = GridwiseGemmBase; + using GridwiseGemm32 = GridwiseGemmBase; // desc for blockwise copy using AGridDesc_AK0_M_AK1 = @@ -607,7 +614,7 @@ struct DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor if(get_warp_size() == 64) { - if constexpr(NXdlPerWave64 > 0) + if constexpr(MXdlPerWave64 > 0) { init_gemm_args(a_grid_ptrs[i], static_cast(p_b), @@ -624,7 +631,7 @@ struct DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor } else { - if constexpr(NXdlPerWave32 > 0) + if constexpr(MXdlPerWave32 > 0) { init_gemm_args(a_grid_ptrs[i], static_cast(p_b), @@ -769,7 +776,24 @@ struct DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor } } - INVOKER_RUN_IMPL + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) + { + if(get_warp_size() == 64) + { + if constexpr(MXdlPerWave64 > 0) + { + return RunImp(arg, stream_config); + } + } + else + { + if constexpr(MXdlPerWave32 > 0) + { + return RunImp(arg, stream_config); + } + } + return 0; + } float Run(const BaseArgument* p_arg, const StreamConfig& stream_config = StreamConfig{}) override @@ -822,7 +846,10 @@ struct DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor return false; } } - if(!ck::is_xdl_wmma_supported()) + if(!ck::is_xdl_wmma_supported()) { return false; } @@ -1205,8 +1232,12 @@ struct DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor auto str = std::stringstream(); // clang-format off - str << "DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor" - << "<" + str << "DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor"; + if(get_warp_size() != 64) { + str << "_WmmaPorted"; + } + + str << "<" << BlockSize << ", " << MPerBlock << ", " << NPerBlock << ", " diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_softmax_gemm_permute_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_softmax_gemm_permute_xdl_cshuffle.hpp index c09e526526..b6c2030dee 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_softmax_gemm_permute_xdl_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_softmax_gemm_permute_xdl_cshuffle.hpp @@ -206,9 +206,9 @@ struct DeviceGroupedGemmSoftmaxGemmPermute_Xdl_CShuffle MaskingSpec> { static constexpr auto MXdlPerWave64 = - GetNXdlPerWave2(); + GetXdlPerWave2(); static constexpr auto MXdlPerWave32 = - GetNXdlPerWave2(); + GetXdlPerWave2(); static_assert(NumDimG > 0 && NumDimM > 0 && NumDimN > 0 && NumDimK > 0 && NumDimO > 0, "Number of dimension must be greater than 0"); diff --git a/include/ck_tile/core/config.hpp b/include/ck_tile/core/config.hpp index 08d555d27c..e737421023 100644 --- a/include/ck_tile/core/config.hpp +++ b/include/ck_tile/core/config.hpp @@ -39,7 +39,7 @@ #define CK_TILE_DEVICE inline __device__ #define CK_TILE_HOST_DEVICE inline __host__ __device__ #define CK_TILE_DEVICE_EXTERN __device__ -#if __clang_major__ < 22 +#if LLVM_MAIN_REVISION < 554785 #define CK_TILE_HOST_DEVICE_EXTERN __host__ __device__ #else #define CK_TILE_HOST_DEVICE_EXTERN