diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml new file mode 100644 index 0000000000..16f7e2539c --- /dev/null +++ b/.github/workflows/pre-commit.yml @@ -0,0 +1,16 @@ +name: pre-commit + +on: + pull_request: + push: + branches: [develop] + +jobs: + pre-commit: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - uses: actions/setup-python@v3 + with: + python-version: '3.12' + - uses: pre-commit/action@v3.0.1 diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 03d33757b0..04ebc6b45a 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -32,9 +32,12 @@ repos: language: script types_or: [c++, text] verbose: true - - id: run-remod-if-ck-tile-changed - name: Run remod.py if ck_tile files changed - entry: script/remod_for_ck_tile.sh - language: script + - id: remod-ck-tile + name: Run ck_tile remod.py + entry: python script/remod_for_ck_tile.py + language: python files: '^(include|example)/ck_tile/.*$' + additional_dependencies: + - dos2unix + - clang-format==18.1.3 pass_filenames: false diff --git a/example/ck_tile/40_streamk_gemm/run_gemm_example.inc b/example/ck_tile/40_streamk_gemm/run_gemm_example.inc index 5fdf6b29ef..6dd054ee11 100644 --- a/example/ck_tile/40_streamk_gemm/run_gemm_example.inc +++ b/example/ck_tile/40_streamk_gemm/run_gemm_example.inc @@ -2,29 +2,6 @@ // SPDX-License-Identifier: MIT #pragma once -// Estimate the number of WGs contributing to the same macro tile in C -template -int estimate_num_wgs_per_tile(const TilePartitioner& tile_partitioner) -{ - // In the case of non-atomic reduction or DP only, there will always be 1 WG contributing to a - // macro time in C - int num_wgs_per_tile = 1; - - // Otherwise, for atomics, multiple WGs may be contributing to the same macro tile in C - if(tile_partitioner.sk_num_blocks > 0 && - ReductionStrategy == ck_tile::StreamKReductionStrategy::Atomic) - { - // Determine the number of iterations per WG for a given macro tile in C - uint32_t k_iters_per_block = tile_partitioner.k_iters_per_big_block - 1; - - // Estimate the number of WGs per macro tile - num_wgs_per_tile = (tile_partitioner.k_iters_per_tile.get() / (k_iters_per_block)) + - ((tile_partitioner.k_iters_per_tile.get() % k_iters_per_block) != 0); - } - - return std::max(num_wgs_per_tile, 1); -} - template static constexpr inline auto is_row_major(Layout) { @@ -65,7 +42,8 @@ template -std::tuple gemm(const ck_tile::StreamKHostArgs& args, const ck_tile::stream_config& s); +std::tuple gemm(const ck_tile::StreamKHostArgs& args, + const ck_tile::stream_config& s); template -std::tuple invoke_gemm(ck_tile::DeviceMem& a_m_k_dev_buf, - ck_tile::DeviceMem& b_k_n_dev_buf, - ck_tile::DeviceMem& c_m_n_dev_buf, - ck_tile::index_t M, - ck_tile::index_t N, - ck_tile::index_t K, - ck_tile::index_t stride_A, - ck_tile::index_t stride_B, - ck_tile::index_t stride_C, - int n_warmup, - int n_repeat, - bool flush_cache, - ck_tile::StreamKReductionStrategy reduction_strategy, - uint32_t num_sk_blocks) +std::tuple +invoke_gemm(ck_tile::DeviceMem& a_m_k_dev_buf, + ck_tile::DeviceMem& b_k_n_dev_buf, + ck_tile::DeviceMem& c_m_n_dev_buf, + ck_tile::index_t M, + ck_tile::index_t N, + ck_tile::index_t K, + ck_tile::index_t stride_A, + ck_tile::index_t stride_B, + ck_tile::index_t stride_C, + int n_warmup, + int n_repeat, + bool flush_cache, + ck_tile::StreamKReductionStrategy reduction_strategy, + uint32_t num_sk_blocks) { ck_tile::StreamKHostArgs args{a_m_k_dev_buf.GetDeviceBuffer(), b_k_n_dev_buf.GetDeviceBuffer(), @@ -105,7 +84,7 @@ std::tuple invoke_gemm(ck_tile::DeviceMem& a_m_k_dev_buf, reduction_strategy, num_sk_blocks}; - std::tuple ave_time_and_batch; + std::tuple ave_time_and_batch; if(args.reduction_strategy == ck_tile::StreamKReductionStrategy::Atomic) { diff --git a/example/ck_tile/40_streamk_gemm/streamk_gemm_basic.cpp b/example/ck_tile/40_streamk_gemm/streamk_gemm_basic.cpp index bb6b1eb413..40709e38e2 100644 --- a/example/ck_tile/40_streamk_gemm/streamk_gemm_basic.cpp +++ b/example/ck_tile/40_streamk_gemm/streamk_gemm_basic.cpp @@ -3,6 +3,7 @@ #include "gemm_utils.hpp" #include "run_gemm_example.inc" +#include "ck_tile/ops/common.hpp" template -std::tuple gemm(const ck_tile::StreamKHostArgs& args, const ck_tile::stream_config& s) +std::tuple gemm(const ck_tile::StreamKHostArgs& args, + const ck_tile::stream_config& s) { using GemmShape = ck_tile::TileGemmShape< @@ -42,7 +44,7 @@ std::tuple gemm(const ck_tile::StreamKHostArgs& args, const ck_tile: GemmConfig::NumWaveGroups, GemmConfig::Preshuffle>; - const auto Run = [&](const auto memory_operation) -> std::tuple { + const auto Run = [&](const auto memory_operation) -> std::tuple { // We create the GEMM pipeline without specifying has_hot_loop or tail_num. // This is because num_loop can vary (a) per WG and (b) per iteration of the Stream-K // while loop. Instead, has_hot_loop and tail_num are determined in the Stream-K @@ -113,7 +115,13 @@ std::tuple gemm(const ck_tile::StreamKHostArgs& args, const ck_tile: preprocess, ck_tile::make_kernel(Kernel{}, grids, blocks, 0, kargs)); - int num_wgs_per_tile = estimate_num_wgs_per_tile(kargs.tile_partitioner); + ck_tile::index_t num_wgs_per_tile = ck_tile::estimate_num_wgs_per_tile( + kargs.tile_partitioner.sk_num_blocks, + // k_iters_per_big_block could be 1, which indicates that all Stream-K workgroups are + // big and each does one iteration. Thus, we ensure the value passed in is at least 1 to + // avoid division by zero errors. + ck_tile::max(kargs.tile_partitioner.k_iters_per_big_block - 1, 1u), + kargs.tile_partitioner.k_iters_per_tile.get()); return std::tuple{ave_time, num_wgs_per_tile}; }; diff --git a/example/ck_tile/remod.py b/example/ck_tile/remod.py index b2ac7c52bf..4fa3a4e430 100644 --- a/example/ck_tile/remod.py +++ b/example/ck_tile/remod.py @@ -1,3 +1,4 @@ +import os import pathlib from pathlib import Path import subprocess @@ -10,8 +11,12 @@ for p in sorted(Path("./").rglob("*")): # formatting for x in all_files: - subprocess.Popen(f"dos2unix -n {str(x)}", shell=True) - cmd = f"clang-format-18 -style=file -i {str(x)}" + subprocess.Popen( + f"python -m dos2unix {str(x)} {str(x)}", + shell=True, + stdout=open(os.devnull, "wb"), + ) + cmd = f"clang-format -style=file -i {str(x)}" # for xp in x.parents: # print(get_file_base(x)) subprocess.Popen(cmd, shell=True) diff --git a/include/ck_tile/core/utility/reduce_operator.hpp b/include/ck_tile/core/utility/reduce_operator.hpp index a698c91e45..f870bd99d6 100644 --- a/include/ck_tile/core/utility/reduce_operator.hpp +++ b/include/ck_tile/core/utility/reduce_operator.hpp @@ -73,7 +73,7 @@ struct Max std::is_same_v || std::is_same_v>> CK_TILE_HOST_DEVICE static constexpr T GetIdentityValue() { - return numeric::min(); + return numeric::lowest(); }; template || std::is_same_v>> CK_TILE_HOST_DEVICE static constexpr T GetIdentityValue() { - return numeric::min(); + return numeric::lowest(); }; template namespace ck_tile { +// GPU kernel to invalidate instruction cache for accurate benchmarking. +// s_icache_inv: Asynchronously invalidates the L1 instruction cache on this compute unit, +// forcing subsequent kernel runs to fetch instructions from HBM instead of cache. +// 16x s_nop: Wait cycles (~16 cycles) to ensure cache invalidation completes before kernel +// exits. Without these NOPs, the flush may not finish, leading to inconsistent +// timing measurements where some instructions remain cached. static __global__ void flush_cache() { asm __volatile__("s_icache_inv \n\t" diff --git a/include/ck_tile/host/rotating_buffers.hpp b/include/ck_tile/host/rotating_buffers.hpp index 86f68ad084..154d67fb8e 100644 --- a/include/ck_tile/host/rotating_buffers.hpp +++ b/include/ck_tile/host/rotating_buffers.hpp @@ -9,6 +9,20 @@ namespace ck_tile { +// RotatingMemWrapper: Prevents GPU data cache reuse during kernel benchmarking. +// +// Purpose: +// When benchmarking a kernel repeatedly with the same input buffers, the GPU L2 cache +// will serve data from cache (hot) instead of HBM (cold), leading to artificially fast +// timing measurements. This wrapper rotates through multiple copies of buffers at different +// memory addresses to force cache misses. +// +// How it works: +// Constructor: Creates rotating_count copies of matrices A and B in GPU memory +// Next(): Switches pointers to the next buffer copy (cycles through all copies) +// Destructor: Frees extra buffer copies and restores original pointers +// +// Combined with flush_icache(), this ensures realistic "cold cache" performance measurements. template struct RotatingMemWrapper { @@ -24,15 +38,18 @@ struct RotatingMemWrapper size_a(size_a_), size_b(size_b_) { + // Store original buffer pointers as first entry p_a_grids.push_back(a_ptr); p_b_grids.push_back(b_ptr); + + // Create (rotating_count - 1) additional copies at different memory addresses for(size_t i = 1; i < rotating_count; i++) { { void* pADeviceBuf; HIP_CHECK_ERROR(hipMalloc(static_cast(&pADeviceBuf), size_a_)); - HIP_CHECK_ERROR(hipMemcpy(static_cast(pADeviceBuf), - const_cast(p_a_grids[0]), + HIP_CHECK_ERROR(hipMemcpy(static_cast(pADeviceBuf), // target buffer + const_cast(p_a_grids[0]), // source buffer size_a_, hipMemcpyDeviceToDevice)); p_a_grids.push_back(pADeviceBuf); @@ -41,19 +58,21 @@ struct RotatingMemWrapper { void* pBDeviceBuf; HIP_CHECK_ERROR(hipMalloc(static_cast(&pBDeviceBuf), size_b_)); - HIP_CHECK_ERROR(hipMemcpy(static_cast(pBDeviceBuf), - const_cast(p_b_grids[0]), + HIP_CHECK_ERROR(hipMemcpy(static_cast(pBDeviceBuf), // target buffer + const_cast(p_b_grids[0]), // source buffer size_b_, hipMemcpyDeviceToDevice)); p_b_grids.push_back(pBDeviceBuf); } } } + // Rotate to the next buffer copy. Call this before each kernel run to use different + // memory addresses, forcing the GPU to fetch data from HBM instead of cache. void Next() { if(rotating_count > 1) { - std::size_t idx = iter++ % rotating_count; + std::size_t idx = iter++ % rotating_count; // Cycle through all buffer copies a_ptr = p_a_grids[idx]; b_ptr = p_b_grids[idx]; } @@ -63,15 +82,16 @@ struct RotatingMemWrapper std::cout << "RotatingMemWrapper: { size_a: " << size_a << ", size_b: " << size_b << ", rotating_count: " << rotating_count << "}" << std::endl; } + // Cleanup: Free all extra buffer copies (keeping original) and restore original pointers ~RotatingMemWrapper() noexcept { if(rotating_count > 1) { - // restore ptr + // Restore original buffer pointers a_ptr = p_a_grids[0]; b_ptr = p_b_grids[0]; - // free device mem + // Free extra buffer copies (index 0 is the original, don't free it) for(size_t i = 1; i < rotating_count; i++) { ck_tile::hip_check_error(hipFree(const_cast(p_a_grids[i]))); @@ -94,7 +114,12 @@ inline void flush_icache() { hipDeviceProp_t deviceProps; HIP_CHECK_ERROR(hipGetDeviceProperties(&deviceProps, 0)); - int32_t gpu_block3 = deviceProps.multiProcessorCount * 60; + + // Over-provision blocks to ensure all CUs execute the flush instruction. + // With imperfect scheduling, launching exactly 1 block per CU doesn't guarantee coverage. + // 60x over-provisioning provides statistical certainty that every CU gets at least one block. + constexpr int32_t blocks_per_cu = 60; + int32_t gpu_block3 = deviceProps.multiProcessorCount * blocks_per_cu; ck_tile::flush_cache<<>>(); HIP_CHECK_ERROR(hipGetLastError()); diff --git a/include/ck_tile/ops/common/streamk_common.hpp b/include/ck_tile/ops/common/streamk_common.hpp index 5dbe6223c4..c01e967dcd 100644 --- a/include/ck_tile/ops/common/streamk_common.hpp +++ b/include/ck_tile/ops/common/streamk_common.hpp @@ -11,4 +11,33 @@ enum StreamKReductionStrategy : uint32_t Atomic = 0u, Reduction = 1u }; + +/** + * @brief Estimates the number of Stream-K workgroups per macro tile in the C tensor. + * + * @param sk_ctas Number of Stream-K workgroups. + * @param iters_per_sk_cta Number of iterations per Stream-K workgroup. + * @param iters_per_tile Number of iterations per tile (i.e., the number of macro tiles in the K + * dimension). + * @return ck_tile::index_t An estimate of the number of workgroups per macro tile in the C tensor. + * @note It is assumed that `iters_per_sk_cta` > 0. + */ +template +ck_tile::index_t +estimate_num_wgs_per_tile(index_t sk_ctas, index_t iters_per_sk_cta, index_t iters_per_tile) +{ + // In the case of non-atomic reduction or data-parallel only, there will always be 1 workgroup + // writing final results to a given macro tile in C. + int num_wgs_per_tile = 1; + + // Otherwise, for atomics, multiple workgroups may be writing to the same macro tile in C. + if(sk_ctas > 0 && ReductionStrategy == ck_tile::StreamKReductionStrategy::Atomic) + { + // Estimate the number of workgroups per macro tile. + num_wgs_per_tile = + (iters_per_tile / iters_per_sk_cta) + ((iters_per_tile % iters_per_sk_cta) != 0); + } + + return std::max(num_wgs_per_tile, 1); +} } // namespace ck_tile diff --git a/include/ck_tile/ops/gemm.hpp b/include/ck_tile/ops/gemm.hpp index 6b587f81d5..e1026485d7 100644 --- a/include/ck_tile/ops/gemm.hpp +++ b/include/ck_tile/ops/gemm.hpp @@ -33,9 +33,10 @@ #include "ck_tile/ops/gemm/kernel/gemm_multi_abd_kernel.hpp" #include "ck_tile/ops/gemm/kernel/gemm_multi_d_kernel.hpp" #include "ck_tile/ops/gemm/kernel/gemm_tile_partitioner.hpp" -#include "ck_tile/ops/gemm/kernel/streamk_gemm_tile_partitioner.hpp" #include "ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp" #include "ck_tile/ops/gemm/kernel/streamk_gemm_kernel.hpp" +#include "ck_tile/ops/gemm/kernel/streamk_gemm_tile_partitioner.hpp" +#include "ck_tile/ops/gemm/kernel/streamk_gemm_tile_partitioner_impl.hpp" #include "ck_tile/ops/gemm/kernel/universal_gemm_kernel.hpp" #include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_base.hpp" #include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_async.hpp" diff --git a/include/ck_tile/remod.py b/include/ck_tile/remod.py index bd940036bd..a8ff2defe5 100644 --- a/include/ck_tile/remod.py +++ b/include/ck_tile/remod.py @@ -86,8 +86,12 @@ class submodule_t: submodule = submodule_t() # formatting for x in all_files: - subprocess.Popen(f"dos2unix -n {str(x)}", shell=True) - cmd = f"clang-format-18 -style=file -i {str(x)}" + subprocess.Popen( + f"python -m dos2unix {str(x)} {str(x)}", + shell=True, + stdout=open(os.devnull, "wb"), + ) + cmd = f"clang-format -style=file -i {str(x)}" # for xp in x.parents: # print(get_file_base(x)) subprocess.Popen(cmd, shell=True) diff --git a/script/install_precommit.sh b/script/install_precommit.sh index fd1840290e..545dcfa666 100755 --- a/script/install_precommit.sh +++ b/script/install_precommit.sh @@ -13,9 +13,6 @@ echo "I: Creating and activating virtual environment for pre-commit..." python3 -m venv "$(dirname "$0")/../.venv" source "$(dirname "$0")/../.venv/bin/activate" -echo "I: Installing tools required for pre-commit checks..." -run_and_check pip install dos2unix -run_and_check pip install clang-format==18.1.3 echo "I: Installing pre-commit in virtual environment..." run_and_check pip install pre-commit run_and_check pre-commit install diff --git a/script/remod_for_ck_tile.py b/script/remod_for_ck_tile.py new file mode 100755 index 0000000000..7601c9d619 --- /dev/null +++ b/script/remod_for_ck_tile.py @@ -0,0 +1,13 @@ +import os + +root_dir = os.getcwd() +ck_tile_include = root_dir + "/include/ck_tile" +ck_tile_example = root_dir + "/example/ck_tile" + +# Run for include +os.chdir(ck_tile_include) +_ = os.system("python remod.py") + +# Run for example +os.chdir(ck_tile_example) +_ = os.system("python remod.py") diff --git a/script/remod_for_ck_tile.sh b/script/remod_for_ck_tile.sh deleted file mode 100755 index 7b99ec60bd..0000000000 --- a/script/remod_for_ck_tile.sh +++ /dev/null @@ -1,7 +0,0 @@ -#!/bin/bash -# Copyright © Advanced Micro Devices, Inc., or its affiliates. -# SPDX-License-Identifier: MIT - -# Run remod.py in both required locations -(cd include/ck_tile/ && python3 remod.py) -(cd example/ck_tile/ && python3 remod.py) diff --git a/test/ck_tile/gemm_streamk/test_gemm_streamk.hpp b/test/ck_tile/gemm_streamk/test_gemm_streamk.hpp index da0b8d153d..c341789435 100644 --- a/test/ck_tile/gemm_streamk/test_gemm_streamk.hpp +++ b/test/ck_tile/gemm_streamk/test_gemm_streamk.hpp @@ -10,6 +10,7 @@ #include #include "ck_tile/host.hpp" +#include "ck_tile/ops/common.hpp" #include "ck_tile/ops/epilogue.hpp" #include "ck_tile/ops/gemm.hpp" @@ -50,10 +51,10 @@ class TestCkTileStreamK : public ::testing::Test bool PadK = true, bool Preshuffle = false, bool TransposeC = false> - bool invoke_streamk(const ck_tile::StreamKHostArgs& args, - const ck_tile::stream_config& s, - int num_cu, - int occupancy) + std::tuple invoke_streamk(const ck_tile::StreamKHostArgs& args, + const ck_tile::stream_config& s, + int num_cu, + int occupancy) { constexpr bool kPadM = PadM; constexpr bool kPadN = PadN; @@ -129,7 +130,7 @@ class TestCkTileStreamK : public ::testing::Test if(!Kernel::IsSupportedArgument(kargs)) { - return false; + return std::tuple{false, -1}; } dim3 grid_dims = Kernel::GridSize(kargs.tile_partitioner); @@ -138,7 +139,16 @@ class TestCkTileStreamK : public ::testing::Test ck_tile::launch_kernel( s, ck_tile::make_kernel(Kernel{}, grid_dims, block_dims, 0, kargs)); - return true; + ck_tile::index_t num_accumulations_per_tile = + ck_tile::estimate_num_wgs_per_tile( + kargs.tile_partitioner.sk_num_blocks, + // k_iters_per_big_block could be 1, which indicates that all blocks are + // big and each does one iteration. Thus, we ensure the value passed in is at + // least 1 to avoid division by zero errors. + ck_tile::max(kargs.tile_partitioner.k_iters_per_big_block - 1, 1u), + kargs.tile_partitioner.k_iters_per_tile.get()); + + return std::tuple{true, num_accumulations_per_tile}; }; return Run(ck_tile::integral_constant( - args, ck_tile::stream_config{nullptr, false, 0, 0, 1}, num_cu, occupancy)) + const auto [is_valid_instance, num_accumulations_per_tile] = + invoke_streamk( + args, ck_tile::stream_config{nullptr, false, 0, 0, 1}, num_cu, occupancy); + + if(!is_valid_instance) { GTEST_SKIP() << "Skipping this test: The kernel cannot solve the problem\n"; } @@ -256,7 +269,7 @@ class TestCkTileStreamK : public ::testing::Test const float max_accumulated_value = *std::max_element(c_m_n_host_ref.mData.begin(), c_m_n_host_ref.mData.end()); const auto rtol_atol = calculate_rtol_atol( - K, /*kbatch*/ 1, max_accumulated_value); + K, num_accumulations_per_tile, max_accumulated_value); bool pass = ck_tile::check_err(c_m_n_dev_result, c_m_n_host_ref,