From 7e44b845b5dd4bcc28d55b4b2764e2be6418a35a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Pietil=C3=A4?= <188998872+vpietila-amd@users.noreply.github.com> Date: Fri, 17 Oct 2025 15:36:39 +0300 Subject: [PATCH 1/7] Fixed handling of split-K autodeduce argument for grouped convolution (#3024) * Fix handling of split-K autodeduce argument. * Fix clang formatting. * Test fix. * Fix clang formatting. --- ...rd_weight_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp | 6 +++++ ...nv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp | 8 ++++++ ...ce_grouped_gemm_multi_abd_xdl_fixed_nk.hpp | 6 +++++ ..._grouped_convnd_bwd_data_interface_xdl.cpp | 27 +++++++++++++++++-- 4 files changed, 45 insertions(+), 2 deletions(-) diff --git a/include/ck/tensor_operation/gpu/device/impl/device_conv2d_backward_weight_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp b/include/ck/tensor_operation/gpu/device/impl/device_conv2d_backward_weight_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp index ff652ebefb..febb037157 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_conv2d_backward_weight_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_conv2d_backward_weight_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp @@ -689,6 +689,12 @@ struct DeviceConv2dBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_ return false; } + // Split-K autodeduction is not supported + if(arg.k_batch_ < 1) + { + return false; + } + // Gridwise GEMM size return GridwiseGemm::CheckValidity(arg.a_grid_desc_kbatch_k0_m_k1_, arg.b_grid_desc_kbatch_k0_n_k1_, diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp index 47832e2153..4672de3504 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp @@ -1523,6 +1523,14 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1 return false; } } + else + { + // Split-K autodeduction is not supported. + if(arg.k_batch_ < 1) + { + return false; + } + } const index_t ConvG = arg.b_g_k_c_xs_lengths_[0]; const index_t ConvK = arg.b_g_k_c_xs_lengths_[1]; diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multi_abd_xdl_fixed_nk.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multi_abd_xdl_fixed_nk.hpp index f6ec0908eb..d5d48777a0 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multi_abd_xdl_fixed_nk.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multi_abd_xdl_fixed_nk.hpp @@ -688,6 +688,12 @@ struct DeviceGroupedGemm_Xdl_Multi_ABD_Fixed_NK static bool IsSupportedArgument(const Argument& arg) { + // Split-K autodeduction is not supported + if(arg.k_batch_ < 1) + { + return false; + } + if(ck::type_convert(arg.gemm_desc_kernel_arg_.size()) != arg.group_count_) { return false; diff --git a/test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data_interface_xdl.cpp b/test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data_interface_xdl.cpp index 01f4260c43..7903c17b22 100644 --- a/test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data_interface_xdl.cpp +++ b/test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data_interface_xdl.cpp @@ -47,10 +47,11 @@ class TestGroupedConvndBwdData : public ::testing::Test // ######| | | | | | Type| Type| Type| DataType| Type| Type| Operation| Operation| Operation| DataSpecialization| GemmM| GemmN| PrefetchStage| Size| Block| Block| Block| | | XDL| XDL| PerWave| PerWave| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| ExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| ExtraN| PerWave| PerWave| _MBlock_MPerBlock| ScalarPerVector| // ######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | Lengths_AK0_M_AK1| ArrangeOrder| | | PerVector| PerVector_AK1| | Lengths_BK0_N_BK1| ArrangeOrder| | | PerVector| PerVector_BK1| | PerShuffle| PerShuffle| _NBlock_NPerBlock| _NPerBlock| // ######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - < NDimSpatial, OutLayout, WeiLayout, ck::Tuple<>, InLayout, DataType, DataType, AccDataType, DataType, ck::Tuple<>, DataType, Pass, Pass, Pass, ConvSpec, true, true, 1, 256, 128, 256, 32, 8, 2, 16, 16, 4, 8, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 4>; + < NDimSpatial, OutLayout, WeiLayout, ck::Tuple<>, InLayout, DataType, DataType, AccDataType, DataType, ck::Tuple<>, DataType, Pass, Pass, Pass, ConvSpec, true, true, 1, 256, 128, 256, 32, 8, 2, 16, 16, 4, 8, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 4>; // clang-format on ck::utils::conv::ConvParam conv_param; + ck::index_t split_k{1}; template bool Run() @@ -112,7 +113,8 @@ class TestGroupedConvndBwdData : public ::testing::Test input_right_pads, Pass{}, Pass{}, - Pass{}); + Pass{}, + split_k); return conv.IsSupportedArgument(argument); } }; @@ -176,3 +178,24 @@ TYPED_TEST(TestGroupedConvndBwdDataDefault, VectorLoadCheck) is_supported = this->template Run<2>(); EXPECT_FALSE(is_supported); } + +TYPED_TEST(TestGroupedConvndBwdDataDefault, SplitK) +{ + if(ck::is_xdl_supported()) + { + // SplitK = 1 + this->conv_param = {2, 2, 4, 192, 192, {1, 1}, {28, 28}, {1, 1}, {1, 1}, {0, 0}, {0, 0}}; + this->split_k = 1; + bool is_supported = this->template Run<2>(); + EXPECT_TRUE(is_supported); + + // Split-K autodeduce + this->split_k = -1; + is_supported = this->template Run<2>(); + EXPECT_FALSE(is_supported); + } + else + { + GTEST_SKIP() << "XDL ops not supported on this device"; + } +} From 8a4cd32d8692c54a3a500ec65d2623c9d27bd7f5 Mon Sep 17 00:00:00 2001 From: Johannes Graner Date: Fri, 17 Oct 2025 18:28:38 +0200 Subject: [PATCH 2/7] Pre-commit in CI (#3029) * Pre-commit in CI * Specify python version, and install dos2unix for remod * Refactor remod hook to correctly install dependencies * Run pre-commit --- .github/workflows/pre-commit.yml | 16 ++++++++++++++++ .pre-commit-config.yaml | 11 +++++++---- example/ck_tile/remod.py | 9 +++++++-- include/ck_tile/ops/gemm.hpp | 3 ++- include/ck_tile/remod.py | 8 ++++++-- script/install_precommit.sh | 3 --- script/remod_for_ck_tile.py | 13 +++++++++++++ script/remod_for_ck_tile.sh | 7 ------- 8 files changed, 51 insertions(+), 19 deletions(-) create mode 100644 .github/workflows/pre-commit.yml create mode 100755 script/remod_for_ck_tile.py delete mode 100755 script/remod_for_ck_tile.sh diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml new file mode 100644 index 0000000000..16f7e2539c --- /dev/null +++ b/.github/workflows/pre-commit.yml @@ -0,0 +1,16 @@ +name: pre-commit + +on: + pull_request: + push: + branches: [develop] + +jobs: + pre-commit: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - uses: actions/setup-python@v3 + with: + python-version: '3.12' + - uses: pre-commit/action@v3.0.1 diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 03d33757b0..04ebc6b45a 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -32,9 +32,12 @@ repos: language: script types_or: [c++, text] verbose: true - - id: run-remod-if-ck-tile-changed - name: Run remod.py if ck_tile files changed - entry: script/remod_for_ck_tile.sh - language: script + - id: remod-ck-tile + name: Run ck_tile remod.py + entry: python script/remod_for_ck_tile.py + language: python files: '^(include|example)/ck_tile/.*$' + additional_dependencies: + - dos2unix + - clang-format==18.1.3 pass_filenames: false diff --git a/example/ck_tile/remod.py b/example/ck_tile/remod.py index b2ac7c52bf..4fa3a4e430 100644 --- a/example/ck_tile/remod.py +++ b/example/ck_tile/remod.py @@ -1,3 +1,4 @@ +import os import pathlib from pathlib import Path import subprocess @@ -10,8 +11,12 @@ for p in sorted(Path("./").rglob("*")): # formatting for x in all_files: - subprocess.Popen(f"dos2unix -n {str(x)}", shell=True) - cmd = f"clang-format-18 -style=file -i {str(x)}" + subprocess.Popen( + f"python -m dos2unix {str(x)} {str(x)}", + shell=True, + stdout=open(os.devnull, "wb"), + ) + cmd = f"clang-format -style=file -i {str(x)}" # for xp in x.parents: # print(get_file_base(x)) subprocess.Popen(cmd, shell=True) diff --git a/include/ck_tile/ops/gemm.hpp b/include/ck_tile/ops/gemm.hpp index 6b587f81d5..e1026485d7 100644 --- a/include/ck_tile/ops/gemm.hpp +++ b/include/ck_tile/ops/gemm.hpp @@ -33,9 +33,10 @@ #include "ck_tile/ops/gemm/kernel/gemm_multi_abd_kernel.hpp" #include "ck_tile/ops/gemm/kernel/gemm_multi_d_kernel.hpp" #include "ck_tile/ops/gemm/kernel/gemm_tile_partitioner.hpp" -#include "ck_tile/ops/gemm/kernel/streamk_gemm_tile_partitioner.hpp" #include "ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp" #include "ck_tile/ops/gemm/kernel/streamk_gemm_kernel.hpp" +#include "ck_tile/ops/gemm/kernel/streamk_gemm_tile_partitioner.hpp" +#include "ck_tile/ops/gemm/kernel/streamk_gemm_tile_partitioner_impl.hpp" #include "ck_tile/ops/gemm/kernel/universal_gemm_kernel.hpp" #include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_base.hpp" #include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_async.hpp" diff --git a/include/ck_tile/remod.py b/include/ck_tile/remod.py index bd940036bd..a8ff2defe5 100644 --- a/include/ck_tile/remod.py +++ b/include/ck_tile/remod.py @@ -86,8 +86,12 @@ class submodule_t: submodule = submodule_t() # formatting for x in all_files: - subprocess.Popen(f"dos2unix -n {str(x)}", shell=True) - cmd = f"clang-format-18 -style=file -i {str(x)}" + subprocess.Popen( + f"python -m dos2unix {str(x)} {str(x)}", + shell=True, + stdout=open(os.devnull, "wb"), + ) + cmd = f"clang-format -style=file -i {str(x)}" # for xp in x.parents: # print(get_file_base(x)) subprocess.Popen(cmd, shell=True) diff --git a/script/install_precommit.sh b/script/install_precommit.sh index fd1840290e..545dcfa666 100755 --- a/script/install_precommit.sh +++ b/script/install_precommit.sh @@ -13,9 +13,6 @@ echo "I: Creating and activating virtual environment for pre-commit..." python3 -m venv "$(dirname "$0")/../.venv" source "$(dirname "$0")/../.venv/bin/activate" -echo "I: Installing tools required for pre-commit checks..." -run_and_check pip install dos2unix -run_and_check pip install clang-format==18.1.3 echo "I: Installing pre-commit in virtual environment..." run_and_check pip install pre-commit run_and_check pre-commit install diff --git a/script/remod_for_ck_tile.py b/script/remod_for_ck_tile.py new file mode 100755 index 0000000000..7601c9d619 --- /dev/null +++ b/script/remod_for_ck_tile.py @@ -0,0 +1,13 @@ +import os + +root_dir = os.getcwd() +ck_tile_include = root_dir + "/include/ck_tile" +ck_tile_example = root_dir + "/example/ck_tile" + +# Run for include +os.chdir(ck_tile_include) +_ = os.system("python remod.py") + +# Run for example +os.chdir(ck_tile_example) +_ = os.system("python remod.py") diff --git a/script/remod_for_ck_tile.sh b/script/remod_for_ck_tile.sh deleted file mode 100755 index 7b99ec60bd..0000000000 --- a/script/remod_for_ck_tile.sh +++ /dev/null @@ -1,7 +0,0 @@ -#!/bin/bash -# Copyright © Advanced Micro Devices, Inc., or its affiliates. -# SPDX-License-Identifier: MIT - -# Run remod.py in both required locations -(cd include/ck_tile/ && python3 remod.py) -(cd example/ck_tile/ && python3 remod.py) From 352dee5225cede21e82bb96f530425e54139f251 Mon Sep 17 00:00:00 2001 From: Emily Martins <65371150+ecamartins@users.noreply.github.com> Date: Fri, 17 Oct 2025 10:33:38 -0600 Subject: [PATCH 3/7] Fix CK Tile Stream-K BF16 Validation Errors (#3039) Prior to this change, the number of accumulations passed into calculate_rtol_atol was 1. That said, in most cases, this is not correct when there are multiple workgroups contributing to the same macro tile in C. This change ensures uses the function estimate_num_wgs_per_tile, which was extracted into a common file and generalized, to estimate the number of workgroups per macro tile. This estimate is passed into calculate_rtol_atol to ensure we get a better relative and absolute tolerance. --- .../40_streamk_gemm/run_gemm_example.inc | 57 ++++++------------- .../40_streamk_gemm/streamk_gemm_basic.cpp | 14 ++++- include/ck_tile/ops/common/streamk_common.hpp | 29 ++++++++++ .../gemm_streamk/test_gemm_streamk.hpp | 31 +++++++--- 4 files changed, 80 insertions(+), 51 deletions(-) diff --git a/example/ck_tile/40_streamk_gemm/run_gemm_example.inc b/example/ck_tile/40_streamk_gemm/run_gemm_example.inc index 5fdf6b29ef..6dd054ee11 100644 --- a/example/ck_tile/40_streamk_gemm/run_gemm_example.inc +++ b/example/ck_tile/40_streamk_gemm/run_gemm_example.inc @@ -2,29 +2,6 @@ // SPDX-License-Identifier: MIT #pragma once -// Estimate the number of WGs contributing to the same macro tile in C -template -int estimate_num_wgs_per_tile(const TilePartitioner& tile_partitioner) -{ - // In the case of non-atomic reduction or DP only, there will always be 1 WG contributing to a - // macro time in C - int num_wgs_per_tile = 1; - - // Otherwise, for atomics, multiple WGs may be contributing to the same macro tile in C - if(tile_partitioner.sk_num_blocks > 0 && - ReductionStrategy == ck_tile::StreamKReductionStrategy::Atomic) - { - // Determine the number of iterations per WG for a given macro tile in C - uint32_t k_iters_per_block = tile_partitioner.k_iters_per_big_block - 1; - - // Estimate the number of WGs per macro tile - num_wgs_per_tile = (tile_partitioner.k_iters_per_tile.get() / (k_iters_per_block)) + - ((tile_partitioner.k_iters_per_tile.get() % k_iters_per_block) != 0); - } - - return std::max(num_wgs_per_tile, 1); -} - template static constexpr inline auto is_row_major(Layout) { @@ -65,7 +42,8 @@ template -std::tuple gemm(const ck_tile::StreamKHostArgs& args, const ck_tile::stream_config& s); +std::tuple gemm(const ck_tile::StreamKHostArgs& args, + const ck_tile::stream_config& s); template -std::tuple invoke_gemm(ck_tile::DeviceMem& a_m_k_dev_buf, - ck_tile::DeviceMem& b_k_n_dev_buf, - ck_tile::DeviceMem& c_m_n_dev_buf, - ck_tile::index_t M, - ck_tile::index_t N, - ck_tile::index_t K, - ck_tile::index_t stride_A, - ck_tile::index_t stride_B, - ck_tile::index_t stride_C, - int n_warmup, - int n_repeat, - bool flush_cache, - ck_tile::StreamKReductionStrategy reduction_strategy, - uint32_t num_sk_blocks) +std::tuple +invoke_gemm(ck_tile::DeviceMem& a_m_k_dev_buf, + ck_tile::DeviceMem& b_k_n_dev_buf, + ck_tile::DeviceMem& c_m_n_dev_buf, + ck_tile::index_t M, + ck_tile::index_t N, + ck_tile::index_t K, + ck_tile::index_t stride_A, + ck_tile::index_t stride_B, + ck_tile::index_t stride_C, + int n_warmup, + int n_repeat, + bool flush_cache, + ck_tile::StreamKReductionStrategy reduction_strategy, + uint32_t num_sk_blocks) { ck_tile::StreamKHostArgs args{a_m_k_dev_buf.GetDeviceBuffer(), b_k_n_dev_buf.GetDeviceBuffer(), @@ -105,7 +84,7 @@ std::tuple invoke_gemm(ck_tile::DeviceMem& a_m_k_dev_buf, reduction_strategy, num_sk_blocks}; - std::tuple ave_time_and_batch; + std::tuple ave_time_and_batch; if(args.reduction_strategy == ck_tile::StreamKReductionStrategy::Atomic) { diff --git a/example/ck_tile/40_streamk_gemm/streamk_gemm_basic.cpp b/example/ck_tile/40_streamk_gemm/streamk_gemm_basic.cpp index bb6b1eb413..40709e38e2 100644 --- a/example/ck_tile/40_streamk_gemm/streamk_gemm_basic.cpp +++ b/example/ck_tile/40_streamk_gemm/streamk_gemm_basic.cpp @@ -3,6 +3,7 @@ #include "gemm_utils.hpp" #include "run_gemm_example.inc" +#include "ck_tile/ops/common.hpp" template -std::tuple gemm(const ck_tile::StreamKHostArgs& args, const ck_tile::stream_config& s) +std::tuple gemm(const ck_tile::StreamKHostArgs& args, + const ck_tile::stream_config& s) { using GemmShape = ck_tile::TileGemmShape< @@ -42,7 +44,7 @@ std::tuple gemm(const ck_tile::StreamKHostArgs& args, const ck_tile: GemmConfig::NumWaveGroups, GemmConfig::Preshuffle>; - const auto Run = [&](const auto memory_operation) -> std::tuple { + const auto Run = [&](const auto memory_operation) -> std::tuple { // We create the GEMM pipeline without specifying has_hot_loop or tail_num. // This is because num_loop can vary (a) per WG and (b) per iteration of the Stream-K // while loop. Instead, has_hot_loop and tail_num are determined in the Stream-K @@ -113,7 +115,13 @@ std::tuple gemm(const ck_tile::StreamKHostArgs& args, const ck_tile: preprocess, ck_tile::make_kernel(Kernel{}, grids, blocks, 0, kargs)); - int num_wgs_per_tile = estimate_num_wgs_per_tile(kargs.tile_partitioner); + ck_tile::index_t num_wgs_per_tile = ck_tile::estimate_num_wgs_per_tile( + kargs.tile_partitioner.sk_num_blocks, + // k_iters_per_big_block could be 1, which indicates that all Stream-K workgroups are + // big and each does one iteration. Thus, we ensure the value passed in is at least 1 to + // avoid division by zero errors. + ck_tile::max(kargs.tile_partitioner.k_iters_per_big_block - 1, 1u), + kargs.tile_partitioner.k_iters_per_tile.get()); return std::tuple{ave_time, num_wgs_per_tile}; }; diff --git a/include/ck_tile/ops/common/streamk_common.hpp b/include/ck_tile/ops/common/streamk_common.hpp index 5dbe6223c4..c01e967dcd 100644 --- a/include/ck_tile/ops/common/streamk_common.hpp +++ b/include/ck_tile/ops/common/streamk_common.hpp @@ -11,4 +11,33 @@ enum StreamKReductionStrategy : uint32_t Atomic = 0u, Reduction = 1u }; + +/** + * @brief Estimates the number of Stream-K workgroups per macro tile in the C tensor. + * + * @param sk_ctas Number of Stream-K workgroups. + * @param iters_per_sk_cta Number of iterations per Stream-K workgroup. + * @param iters_per_tile Number of iterations per tile (i.e., the number of macro tiles in the K + * dimension). + * @return ck_tile::index_t An estimate of the number of workgroups per macro tile in the C tensor. + * @note It is assumed that `iters_per_sk_cta` > 0. + */ +template +ck_tile::index_t +estimate_num_wgs_per_tile(index_t sk_ctas, index_t iters_per_sk_cta, index_t iters_per_tile) +{ + // In the case of non-atomic reduction or data-parallel only, there will always be 1 workgroup + // writing final results to a given macro tile in C. + int num_wgs_per_tile = 1; + + // Otherwise, for atomics, multiple workgroups may be writing to the same macro tile in C. + if(sk_ctas > 0 && ReductionStrategy == ck_tile::StreamKReductionStrategy::Atomic) + { + // Estimate the number of workgroups per macro tile. + num_wgs_per_tile = + (iters_per_tile / iters_per_sk_cta) + ((iters_per_tile % iters_per_sk_cta) != 0); + } + + return std::max(num_wgs_per_tile, 1); +} } // namespace ck_tile diff --git a/test/ck_tile/gemm_streamk/test_gemm_streamk.hpp b/test/ck_tile/gemm_streamk/test_gemm_streamk.hpp index da0b8d153d..c341789435 100644 --- a/test/ck_tile/gemm_streamk/test_gemm_streamk.hpp +++ b/test/ck_tile/gemm_streamk/test_gemm_streamk.hpp @@ -10,6 +10,7 @@ #include #include "ck_tile/host.hpp" +#include "ck_tile/ops/common.hpp" #include "ck_tile/ops/epilogue.hpp" #include "ck_tile/ops/gemm.hpp" @@ -50,10 +51,10 @@ class TestCkTileStreamK : public ::testing::Test bool PadK = true, bool Preshuffle = false, bool TransposeC = false> - bool invoke_streamk(const ck_tile::StreamKHostArgs& args, - const ck_tile::stream_config& s, - int num_cu, - int occupancy) + std::tuple invoke_streamk(const ck_tile::StreamKHostArgs& args, + const ck_tile::stream_config& s, + int num_cu, + int occupancy) { constexpr bool kPadM = PadM; constexpr bool kPadN = PadN; @@ -129,7 +130,7 @@ class TestCkTileStreamK : public ::testing::Test if(!Kernel::IsSupportedArgument(kargs)) { - return false; + return std::tuple{false, -1}; } dim3 grid_dims = Kernel::GridSize(kargs.tile_partitioner); @@ -138,7 +139,16 @@ class TestCkTileStreamK : public ::testing::Test ck_tile::launch_kernel( s, ck_tile::make_kernel(Kernel{}, grid_dims, block_dims, 0, kargs)); - return true; + ck_tile::index_t num_accumulations_per_tile = + ck_tile::estimate_num_wgs_per_tile( + kargs.tile_partitioner.sk_num_blocks, + // k_iters_per_big_block could be 1, which indicates that all blocks are + // big and each does one iteration. Thus, we ensure the value passed in is at + // least 1 to avoid division by zero errors. + ck_tile::max(kargs.tile_partitioner.k_iters_per_big_block - 1, 1u), + kargs.tile_partitioner.k_iters_per_tile.get()); + + return std::tuple{true, num_accumulations_per_tile}; }; return Run(ck_tile::integral_constant( - args, ck_tile::stream_config{nullptr, false, 0, 0, 1}, num_cu, occupancy)) + const auto [is_valid_instance, num_accumulations_per_tile] = + invoke_streamk( + args, ck_tile::stream_config{nullptr, false, 0, 0, 1}, num_cu, occupancy); + + if(!is_valid_instance) { GTEST_SKIP() << "Skipping this test: The kernel cannot solve the problem\n"; } @@ -256,7 +269,7 @@ class TestCkTileStreamK : public ::testing::Test const float max_accumulated_value = *std::max_element(c_m_n_host_ref.mData.begin(), c_m_n_host_ref.mData.end()); const auto rtol_atol = calculate_rtol_atol( - K, /*kbatch*/ 1, max_accumulated_value); + K, num_accumulations_per_tile, max_accumulated_value); bool pass = ck_tile::check_err(c_m_n_dev_result, c_m_n_host_ref, From 889ffc0b1d9a6913ee84f44c08d690a1e4d4828d Mon Sep 17 00:00:00 2001 From: Yashvardhan Agarwal Date: Fri, 17 Oct 2025 19:49:21 +0300 Subject: [PATCH 4/7] fix identity values in Max and AbsMax (#3048) - The identity value method returned the minimum positive number while we need the lowest number for Max and AbsMax operations --- include/ck_tile/core/utility/reduce_operator.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/ck_tile/core/utility/reduce_operator.hpp b/include/ck_tile/core/utility/reduce_operator.hpp index a698c91e45..f870bd99d6 100644 --- a/include/ck_tile/core/utility/reduce_operator.hpp +++ b/include/ck_tile/core/utility/reduce_operator.hpp @@ -73,7 +73,7 @@ struct Max std::is_same_v || std::is_same_v>> CK_TILE_HOST_DEVICE static constexpr T GetIdentityValue() { - return numeric::min(); + return numeric::lowest(); }; template || std::is_same_v>> CK_TILE_HOST_DEVICE static constexpr T GetIdentityValue() { - return numeric::min(); + return numeric::lowest(); }; template Date: Wed, 15 Oct 2025 02:39:04 +0000 Subject: [PATCH 5/7] docs: add inline comments about flush_cache and rotating buffer --- include/ck_tile/host/flush_icache.hpp | 6 ++++ include/ck_tile/host/rotating_buffers.hpp | 41 ++++++++++++++++++----- 2 files changed, 39 insertions(+), 8 deletions(-) diff --git a/include/ck_tile/host/flush_icache.hpp b/include/ck_tile/host/flush_icache.hpp index 9230b50a13..f4852252be 100644 --- a/include/ck_tile/host/flush_icache.hpp +++ b/include/ck_tile/host/flush_icache.hpp @@ -6,6 +6,12 @@ #include namespace ck_tile { +// GPU kernel to invalidate instruction cache for accurate benchmarking. +// s_icache_inv: Asynchronously invalidates the L1 instruction cache on this compute unit, +// forcing subsequent kernel runs to fetch instructions from HBM instead of cache. +// 16x s_nop: Wait cycles (~16 cycles) to ensure cache invalidation completes before kernel +// exits. Without these NOPs, the flush may not finish, leading to inconsistent +// timing measurements where some instructions remain cached. static __global__ void flush_cache() { asm __volatile__("s_icache_inv \n\t" diff --git a/include/ck_tile/host/rotating_buffers.hpp b/include/ck_tile/host/rotating_buffers.hpp index 86f68ad084..154d67fb8e 100644 --- a/include/ck_tile/host/rotating_buffers.hpp +++ b/include/ck_tile/host/rotating_buffers.hpp @@ -9,6 +9,20 @@ namespace ck_tile { +// RotatingMemWrapper: Prevents GPU data cache reuse during kernel benchmarking. +// +// Purpose: +// When benchmarking a kernel repeatedly with the same input buffers, the GPU L2 cache +// will serve data from cache (hot) instead of HBM (cold), leading to artificially fast +// timing measurements. This wrapper rotates through multiple copies of buffers at different +// memory addresses to force cache misses. +// +// How it works: +// Constructor: Creates rotating_count copies of matrices A and B in GPU memory +// Next(): Switches pointers to the next buffer copy (cycles through all copies) +// Destructor: Frees extra buffer copies and restores original pointers +// +// Combined with flush_icache(), this ensures realistic "cold cache" performance measurements. template struct RotatingMemWrapper { @@ -24,15 +38,18 @@ struct RotatingMemWrapper size_a(size_a_), size_b(size_b_) { + // Store original buffer pointers as first entry p_a_grids.push_back(a_ptr); p_b_grids.push_back(b_ptr); + + // Create (rotating_count - 1) additional copies at different memory addresses for(size_t i = 1; i < rotating_count; i++) { { void* pADeviceBuf; HIP_CHECK_ERROR(hipMalloc(static_cast(&pADeviceBuf), size_a_)); - HIP_CHECK_ERROR(hipMemcpy(static_cast(pADeviceBuf), - const_cast(p_a_grids[0]), + HIP_CHECK_ERROR(hipMemcpy(static_cast(pADeviceBuf), // target buffer + const_cast(p_a_grids[0]), // source buffer size_a_, hipMemcpyDeviceToDevice)); p_a_grids.push_back(pADeviceBuf); @@ -41,19 +58,21 @@ struct RotatingMemWrapper { void* pBDeviceBuf; HIP_CHECK_ERROR(hipMalloc(static_cast(&pBDeviceBuf), size_b_)); - HIP_CHECK_ERROR(hipMemcpy(static_cast(pBDeviceBuf), - const_cast(p_b_grids[0]), + HIP_CHECK_ERROR(hipMemcpy(static_cast(pBDeviceBuf), // target buffer + const_cast(p_b_grids[0]), // source buffer size_b_, hipMemcpyDeviceToDevice)); p_b_grids.push_back(pBDeviceBuf); } } } + // Rotate to the next buffer copy. Call this before each kernel run to use different + // memory addresses, forcing the GPU to fetch data from HBM instead of cache. void Next() { if(rotating_count > 1) { - std::size_t idx = iter++ % rotating_count; + std::size_t idx = iter++ % rotating_count; // Cycle through all buffer copies a_ptr = p_a_grids[idx]; b_ptr = p_b_grids[idx]; } @@ -63,15 +82,16 @@ struct RotatingMemWrapper std::cout << "RotatingMemWrapper: { size_a: " << size_a << ", size_b: " << size_b << ", rotating_count: " << rotating_count << "}" << std::endl; } + // Cleanup: Free all extra buffer copies (keeping original) and restore original pointers ~RotatingMemWrapper() noexcept { if(rotating_count > 1) { - // restore ptr + // Restore original buffer pointers a_ptr = p_a_grids[0]; b_ptr = p_b_grids[0]; - // free device mem + // Free extra buffer copies (index 0 is the original, don't free it) for(size_t i = 1; i < rotating_count; i++) { ck_tile::hip_check_error(hipFree(const_cast(p_a_grids[i]))); @@ -94,7 +114,12 @@ inline void flush_icache() { hipDeviceProp_t deviceProps; HIP_CHECK_ERROR(hipGetDeviceProperties(&deviceProps, 0)); - int32_t gpu_block3 = deviceProps.multiProcessorCount * 60; + + // Over-provision blocks to ensure all CUs execute the flush instruction. + // With imperfect scheduling, launching exactly 1 block per CU doesn't guarantee coverage. + // 60x over-provisioning provides statistical certainty that every CU gets at least one block. + constexpr int32_t blocks_per_cu = 60; + int32_t gpu_block3 = deviceProps.multiProcessorCount * blocks_per_cu; ck_tile::flush_cache<<>>(); HIP_CHECK_ERROR(hipGetLastError()); From d88ea05c844cd159a14213b73a5818a43c5b79e6 Mon Sep 17 00:00:00 2001 From: Illia Silin <98187287+illsilin@users.noreply.github.com> Date: Fri, 17 Oct 2025 19:52:22 -0700 Subject: [PATCH 6/7] disable aiter test gemm_a8w8_blockscale (#3049) --- Jenkinsfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index 3fbcdb5849..43b51d4f0f 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -909,7 +909,7 @@ def run_aiter_tests(Map conf=[:]){ sh "rocminfo" sh "python3 --version" sh "python3 /home/jenkins/workspace/aiter/op_tests/test_gemm_a8w8.py" - sh "python3 /home/jenkins/workspace/aiter/op_tests/test_gemm_a8w8_blockscale.py" + //sh "python3 /home/jenkins/workspace/aiter/op_tests/test_gemm_a8w8_blockscale.py" //temporarily disable sh "python3 /home/jenkins/workspace/aiter/op_tests/test_mha.py" sh "python3 /home/jenkins/workspace/aiter/op_tests/test_mha_varlen.py" sh "python3 /home/jenkins/workspace/aiter/op_tests/test_moe.py" From af3786fe0814a75646ff3194f86eab0e24b047e6 Mon Sep 17 00:00:00 2001 From: BrianHarrisonAMD <169072757+BrianHarrisonAMD@users.noreply.github.com> Date: Sun, 19 Oct 2025 17:09:21 -0600 Subject: [PATCH 7/7] Add dvc pull step (#3056) * Add dvc pull step * Remove CD * Add details about LOGNAME and fail if dvc isn't installed --- .github/workflows/therock-ci-linux.yml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/.github/workflows/therock-ci-linux.yml b/.github/workflows/therock-ci-linux.yml index beaabbe763..f4d0c0063c 100644 --- a/.github/workflows/therock-ci-linux.yml +++ b/.github/workflows/therock-ci-linux.yml @@ -35,6 +35,15 @@ jobs: with: repository: "ROCm/rocm-libraries" + - name: Pull DVC files for rocm-libraries # LOGNAME details here https://github.com/ROCm/rocm-libraries/pull/1617 + run: | + if command -v dvc &> /dev/null; then + echo "dvc detected" + else + echo "Warning, dvc not detected!" + fi + LOGNAME=github-runner dvc pull -v + - name: Checkout composable_kernel repository uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: