diff --git a/.github/workflows/therock-ci-linux.yml b/.github/workflows/therock-ci-linux.yml index f4d0c0063c..86d134e456 100644 --- a/.github/workflows/therock-ci-linux.yml +++ b/.github/workflows/therock-ci-linux.yml @@ -53,8 +53,8 @@ jobs: uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: repository: "ROCm/TheRock" - ref: c2921b151b8285a1d29942aceb33cfe0fea77ac9 # 10-15-2025 commit path: "TheRock" + ref: f3f77a3161922df3eee006b888b439d75b2b4668 # 2025-10-29 commit - name: Setup ccache run: | @@ -77,6 +77,8 @@ jobs: - name: Patch rocm-libraries run: | git config --global --add safe.directory '*' + # Remove patches here if they cannot be applied cleanly, and they have not been deleted from TheRock repo + rm -f ./TheRock/patches/amd-mainline/rocm-libraries/0008-Revert-remove-options-no-enumerate-966.patch git -c user.name="therockbot" -c "user.email=therockbot@amd.com" am --whitespace=nowarn ./TheRock/patches/amd-mainline/rocm-libraries/*.patch - name: Install python deps @@ -128,7 +130,7 @@ jobs: run: | python3 TheRock/build_tools/github_actions/post_build_upload.py \ --run-id ${{ github.run_id }} \ - --amdgpu-family ${{ env.AMDGPU_FAMILIES }} \ + --artifact-group ${{ env.AMDGPU_FAMILIES }} \ --build-dir TheRock/build \ --upload diff --git a/.github/workflows/therock-test-component.yml b/.github/workflows/therock-test-component.yml index 1ccc1d57bc..27eff4fdb0 100644 --- a/.github/workflows/therock-test-component.yml +++ b/.github/workflows/therock-test-component.yml @@ -51,13 +51,13 @@ jobs: uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 with: repository: "ROCm/TheRock" - ref: c2921b151b8285a1d29942aceb33cfe0fea77ac9 # 10-15-2025 commit + ref: f3f77a3161922df3eee006b888b439d75b2b4668 # 2025-10-29 commit - name: Run setup test environment workflow uses: './.github/actions/setup_test_environment' with: ARTIFACT_RUN_ID: ${{ env.ARTIFACT_RUN_ID }} - AMDGPU_FAMILIES: ${{ inputs.amdgpu_families }} + ARTIFACT_GROUP: ${{ inputs.amdgpu_families }} OUTPUT_ARTIFACTS_DIR: ${{ env.OUTPUT_ARTIFACTS_DIR }} VENV_DIR: ${{ env.VENV_DIR }} FETCH_ARTIFACT_ARGS: ${{ fromJSON(inputs.component).fetch_artifact_args }} diff --git a/.github/workflows/therock-test-packages.yml b/.github/workflows/therock-test-packages.yml index efb5a6b1a0..81632fce48 100644 --- a/.github/workflows/therock-test-packages.yml +++ b/.github/workflows/therock-test-packages.yml @@ -27,7 +27,7 @@ jobs: uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: repository: "ROCm/TheRock" - ref: c2921b151b8285a1d29942aceb33cfe0fea77ac9 # 10-15-2025 commit + ref: f3f77a3161922df3eee006b888b439d75b2b4668 # 2025-10-29 commit - name: "Configuring CI options" env: diff --git a/.gitignore b/.gitignore index 6641e5bc58..2641a661d8 100644 --- a/.gitignore +++ b/.gitignore @@ -66,6 +66,12 @@ docs/doxygen/xml cmake-build*/ build*/ +# LSP configuration +.clangd + +# User-defined CMake presets +CMakeUserPresets.json + # Python virtualenv .venv/ diff --git a/CMakeLists.txt b/CMakeLists.txt index 049da5637f..7b4990dba4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -683,6 +683,12 @@ if(NOT GPU_ARCHS AND USER_GPU_TARGETS AND NOT MIOPEN_REQ_LIBS_ONLY) PACKAGE_NAME examples ) add_subdirectory(example) + + add_subdirectory(tutorial) + rocm_package_setup_component(tutorials + LIBRARY_NAME composablekernel + PACKAGE_NAME tutorials + ) add_subdirectory(tile_engine) if(BUILD_TESTING) add_subdirectory(test) diff --git a/Jenkinsfile b/Jenkinsfile index aa4045186e..80db99684a 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -1836,10 +1836,11 @@ pipeline { } agent{ label rocmnode("gfx90a") } environment{ - setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx90a" -DCMAKE_CXX_FLAGS=" -O3 " """ + setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx90a" -DCK_CXX_STANDARD="17" -DCMAKE_CXX_FLAGS=" -O3 " """ execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && \ cmake -DCMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" \ -DGPU_TARGETS="gfx90a" \ + -DCK_CXX_STANDARD="17" \ -DCMAKE_CXX_COMPILER="${build_compiler()}" \ -DCMAKE_C_COMPILER=/opt/rocm/llvm/bin/clang \ -DCMAKE_CXX_FLAGS=" -O3 " .. && make -j """ diff --git a/example/01_gemm/gemm_wmma_fp8_v3.cpp b/example/01_gemm/gemm_wmma_fp8_v3.cpp index 0376820b7b..2f8eac113b 100644 --- a/example/01_gemm/gemm_wmma_fp8_v3.cpp +++ b/example/01_gemm/gemm_wmma_fp8_v3.cpp @@ -13,7 +13,7 @@ using CDataType = ck::bhalf_t; using ComputeTypeA = ck::f8_t; using ComputeTypeB = ck::f8_t; -using ALayout = Row; +using ALayout = Col; using BLayout = Col; using CLayout = Row; @@ -30,13 +30,13 @@ using DeviceGemmV2Instance = ck::tensor_operation::device::DeviceGemm_Wmma_CShuf PassThrough, PassThrough, PassThrough, GemmDefault, 128, 128, 64, 64, - 8, 8, + 16, 16, // AK1, BK1 16, 16, 4, 2, + S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, + 1, 4, 16, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, - 2, 8, 8, 0, - S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, - 2, 8, 8, 0, + 2, 16, 16, 0, 1, 1, S<1, 32, 1, 4>, 8, ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v1, ComputeTypeA, ComputeTypeB>; diff --git a/example/59_grouped_gemm_multi_ABD/grouped_gemm_multi_abd_xdl_fixed_nk_bias_bf16_i8.cpp b/example/59_grouped_gemm_multi_ABD/grouped_gemm_multi_abd_xdl_fixed_nk_bias_bf16_i8.cpp index 63343df3a8..6f30bdaa73 100644 --- a/example/59_grouped_gemm_multi_ABD/grouped_gemm_multi_abd_xdl_fixed_nk_bias_bf16_i8.cpp +++ b/example/59_grouped_gemm_multi_ABD/grouped_gemm_multi_abd_xdl_fixed_nk_bias_bf16_i8.cpp @@ -221,8 +221,8 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co b0_tensors_device.emplace_back(std::make_unique( sizeof(B0DataType) * problem_size.Ns[i] * problem_size.Ks[i])); - b1_tensors_device.emplace_back( - std::make_unique(sizeof(B1DataType) * problem_size.Ns[i])); + b1_tensors_device.emplace_back(std::make_unique( + sizeof(B1DataType) * problem_size.Ns[i] * problem_size.Ks[i])); d0_tensors_device.emplace_back( std::make_unique(sizeof(D0DataType) * problem_size.Ns[i])); diff --git a/example/67_gemm_microscaling/moe_gemm1_xdl_mx_fp4_bpreshuffle.cpp b/example/67_gemm_microscaling/moe_gemm1_xdl_mx_fp4_bpreshuffle.cpp index 1adf039b70..ebb73ca7e0 100644 --- a/example/67_gemm_microscaling/moe_gemm1_xdl_mx_fp4_bpreshuffle.cpp +++ b/example/67_gemm_microscaling/moe_gemm1_xdl_mx_fp4_bpreshuffle.cpp @@ -181,7 +181,7 @@ constexpr ck::index_t ScaleBlockSize = 32; // scaling block constexpr ck::index_t KPerBlock = 256 / DataPackedSize; // 256 f4 = 128 fp4x2 static constexpr ck::index_t Nswizzle = false; static constexpr ck::index_t ActOP = 0; // 0: gelu_and_mul, 1: silu_and_mul -static constexpr ck::index_t MPerBlock = 128; +static constexpr ck::index_t MPerBlock = 32; static constexpr bool MulRoutedWeight = true; // clang-format off @@ -190,10 +190,10 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceMoeGemmMXBPreShuffl A0DataType, A1DataType, B0DataType, B1DataType, DsDataType, EDataType, AccDataType, CShuffleDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, ScaleBlockSize, 256, - MPerBlock, 64, KPerBlock, + MPerBlock, 128, KPerBlock, 16, 16, 16, 16, - 4, 2, + 2, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 2, 2, S<1, 32, 1, 8>, S<8, 1, 1, 1>, @@ -213,10 +213,10 @@ int main(int argc, char* argv[]) ck::index_t sorted_size = sorted_tile_num * MPerBlock; ck::index_t valid_size = valid_tile_num * MPerBlock; - ck::index_t N = 6144; - ck::index_t K = 4096; + ck::index_t N = 7168; + ck::index_t K = 256; ck::index_t experts = 8; - ck::index_t tokens = 832; + ck::index_t tokens = 208; ck::index_t topk = 2; if(argc == 1) diff --git a/example/ck_tile/03_gemm/run_gemm_example.inc b/example/ck_tile/03_gemm/run_gemm_example.inc index d5f164c40f..703ab810d8 100644 --- a/example/ck_tile/03_gemm/run_gemm_example.inc +++ b/example/ck_tile/03_gemm/run_gemm_example.inc @@ -309,8 +309,8 @@ int run_gemm_example_with_layouts(ck_tile::ArgParser& arg_parser, if(init_method == 0) { - ck_tile::FillUniformDistribution{-5.f, 5.f}(a_m_k); - ck_tile::FillUniformDistribution{-5.f, 5.f}(b_k_n); + ck_tile::FillUniformDistribution{-2.f, 2.f}(a_m_k); + ck_tile::FillUniformDistribution{-2.f, 2.f}(b_k_n); } else if(init_method == 1) { diff --git a/example/ck_tile/20_grouped_convolution/conv_configs.hpp b/example/ck_tile/20_grouped_convolution/conv_configs.hpp index c688215280..8a2a60a197 100644 --- a/example/ck_tile/20_grouped_convolution/conv_configs.hpp +++ b/example/ck_tile/20_grouped_convolution/conv_configs.hpp @@ -14,28 +14,14 @@ struct ConvConfigBase { - static constexpr bool kPadM = true; - static constexpr bool kPadN = true; - static constexpr bool kPadK = true; - - static constexpr bool PermuteA = false; - static constexpr bool PermuteB = false; - - static constexpr bool TransposeC = false; - static constexpr bool UseStructuredSparsity = false; - static constexpr ck_tile::index_t VectorSizeA = 4; static constexpr ck_tile::index_t VectorSizeB = 8; static constexpr ck_tile::index_t VectorSizeC = 8; - static constexpr int kBlockPerCu = 1; - static constexpr ck_tile::index_t TileParitionerGroupNum = 8; - static constexpr ck_tile::index_t TileParitionerM01 = 4; + static constexpr int kBlockPerCu = 1; static constexpr auto Scheduler = ck_tile::GemmPipelineScheduler::Intrawave; static constexpr ck_tile::GemmPipeline Pipeline = ck_tile::GemmPipeline::COMPUTE_V3; static constexpr ck_tile::index_t NumWaveGroups = 1; - static constexpr bool Preshuffle = false; - static constexpr bool TiledMMAPermuteN = false; static constexpr ck_tile::index_t NumGroupsToMerge = 1; }; @@ -216,9 +202,9 @@ struct ConvConfigComputeV5 : public ConvConfigBase static constexpr ck_tile::index_t N_Warp_Tile = 32; static constexpr ck_tile::index_t K_Warp_Tile = 16; - static constexpr bool DoubleSmemBuffer = false; - static constexpr ck_tile::GemmPipeline Pipeline = ck_tile::GemmPipeline::COMPUTE_V5; - static constexpr ck_tile::index_t NumWaNumWaveGroups = 2; + static constexpr bool DoubleSmemBuffer = false; + static constexpr ck_tile::GemmPipeline Pipeline = ck_tile::GemmPipeline::COMPUTE_V5; + static constexpr ck_tile::index_t NumWaveGroups = 2; }; template diff --git a/example/ck_tile/20_grouped_convolution/grouped_convolution_backward_data.cpp b/example/ck_tile/20_grouped_convolution/grouped_convolution_backward_data.cpp index ad593b1418..8ea892a215 100644 --- a/example/ck_tile/20_grouped_convolution/grouped_convolution_backward_data.cpp +++ b/example/ck_tile/20_grouped_convolution/grouped_convolution_backward_data.cpp @@ -14,7 +14,7 @@ #include "grouped_convolution_backward_data_invoker.hpp" #include "run_grouped_convolution_bwd_data_example.inc" -template