Merge commit 'b03764ca5a917752845ddbb5da8886051a16d9be' into develop

This commit is contained in:
assistant-librarian[bot]
2025-10-17 17:11:18 +00:00
parent 99ccb97fad
commit f2f7a548cb
15 changed files with 172 additions and 80 deletions

View File

@@ -73,7 +73,7 @@ struct Max
std::is_same_v<T, fp8_t> || std::is_same_v<T, bf8_t>>>
CK_TILE_HOST_DEVICE static constexpr T GetIdentityValue()
{
return numeric<T>::min();
return numeric<T>::lowest();
};
template <typename T,
@@ -96,7 +96,7 @@ struct AbsMax
std::is_same_v<T, fp8_t> || std::is_same_v<T, bf8_t>>>
CK_TILE_HOST_DEVICE static constexpr T GetIdentityValue()
{
return numeric<T>::min();
return numeric<T>::lowest();
};
template <typename T,

View File

@@ -6,6 +6,12 @@
#include <hip/hip_runtime.h>
namespace ck_tile {
// GPU kernel to invalidate instruction cache for accurate benchmarking.
// s_icache_inv: Asynchronously invalidates the L1 instruction cache on this compute unit,
// forcing subsequent kernel runs to fetch instructions from HBM instead of cache.
// 16x s_nop: Wait cycles (~16 cycles) to ensure cache invalidation completes before kernel
// exits. Without these NOPs, the flush may not finish, leading to inconsistent
// timing measurements where some instructions remain cached.
static __global__ void flush_cache()
{
asm __volatile__("s_icache_inv \n\t"

View File

@@ -9,6 +9,20 @@
namespace ck_tile {
// RotatingMemWrapper: Prevents GPU data cache reuse during kernel benchmarking.
//
// Purpose:
// When benchmarking a kernel repeatedly with the same input buffers, the GPU L2 cache
// will serve data from cache (hot) instead of HBM (cold), leading to artificially fast
// timing measurements. This wrapper rotates through multiple copies of buffers at different
// memory addresses to force cache misses.
//
// How it works:
// Constructor: Creates rotating_count copies of matrices A and B in GPU memory
// Next(): Switches pointers to the next buffer copy (cycles through all copies)
// Destructor: Frees extra buffer copies and restores original pointers
//
// Combined with flush_icache(), this ensures realistic "cold cache" performance measurements.
template <typename ADataType, typename BDataType>
struct RotatingMemWrapper
{
@@ -24,15 +38,18 @@ struct RotatingMemWrapper
size_a(size_a_),
size_b(size_b_)
{
// Store original buffer pointers as first entry
p_a_grids.push_back(a_ptr);
p_b_grids.push_back(b_ptr);
// Create (rotating_count - 1) additional copies at different memory addresses
for(size_t i = 1; i < rotating_count; i++)
{
{
void* pADeviceBuf;
HIP_CHECK_ERROR(hipMalloc(static_cast<void**>(&pADeviceBuf), size_a_));
HIP_CHECK_ERROR(hipMemcpy(static_cast<void*>(pADeviceBuf),
const_cast<void*>(p_a_grids[0]),
HIP_CHECK_ERROR(hipMemcpy(static_cast<void*>(pADeviceBuf), // target buffer
const_cast<void*>(p_a_grids[0]), // source buffer
size_a_,
hipMemcpyDeviceToDevice));
p_a_grids.push_back(pADeviceBuf);
@@ -41,19 +58,21 @@ struct RotatingMemWrapper
{
void* pBDeviceBuf;
HIP_CHECK_ERROR(hipMalloc(static_cast<void**>(&pBDeviceBuf), size_b_));
HIP_CHECK_ERROR(hipMemcpy(static_cast<void*>(pBDeviceBuf),
const_cast<void*>(p_b_grids[0]),
HIP_CHECK_ERROR(hipMemcpy(static_cast<void*>(pBDeviceBuf), // target buffer
const_cast<void*>(p_b_grids[0]), // source buffer
size_b_,
hipMemcpyDeviceToDevice));
p_b_grids.push_back(pBDeviceBuf);
}
}
}
// Rotate to the next buffer copy. Call this before each kernel run to use different
// memory addresses, forcing the GPU to fetch data from HBM instead of cache.
void Next()
{
if(rotating_count > 1)
{
std::size_t idx = iter++ % rotating_count;
std::size_t idx = iter++ % rotating_count; // Cycle through all buffer copies
a_ptr = p_a_grids[idx];
b_ptr = p_b_grids[idx];
}
@@ -63,15 +82,16 @@ struct RotatingMemWrapper
std::cout << "RotatingMemWrapper: { size_a: " << size_a << ", size_b: " << size_b
<< ", rotating_count: " << rotating_count << "}" << std::endl;
}
// Cleanup: Free all extra buffer copies (keeping original) and restore original pointers
~RotatingMemWrapper() noexcept
{
if(rotating_count > 1)
{
// restore ptr
// Restore original buffer pointers
a_ptr = p_a_grids[0];
b_ptr = p_b_grids[0];
// free device mem
// Free extra buffer copies (index 0 is the original, don't free it)
for(size_t i = 1; i < rotating_count; i++)
{
ck_tile::hip_check_error(hipFree(const_cast<void*>(p_a_grids[i])));
@@ -94,7 +114,12 @@ inline void flush_icache()
{
hipDeviceProp_t deviceProps;
HIP_CHECK_ERROR(hipGetDeviceProperties(&deviceProps, 0));
int32_t gpu_block3 = deviceProps.multiProcessorCount * 60;
// Over-provision blocks to ensure all CUs execute the flush instruction.
// With imperfect scheduling, launching exactly 1 block per CU doesn't guarantee coverage.
// 60x over-provisioning provides statistical certainty that every CU gets at least one block.
constexpr int32_t blocks_per_cu = 60;
int32_t gpu_block3 = deviceProps.multiProcessorCount * blocks_per_cu;
ck_tile::flush_cache<<<dim3(gpu_block3), dim3(64), 0, nullptr>>>();
HIP_CHECK_ERROR(hipGetLastError());

View File

@@ -11,4 +11,33 @@ enum StreamKReductionStrategy : uint32_t
Atomic = 0u,
Reduction = 1u
};
/**
* @brief Estimates the number of Stream-K workgroups per macro tile in the C tensor.
*
* @param sk_ctas Number of Stream-K workgroups.
* @param iters_per_sk_cta Number of iterations per Stream-K workgroup.
* @param iters_per_tile Number of iterations per tile (i.e., the number of macro tiles in the K
* dimension).
* @return ck_tile::index_t An estimate of the number of workgroups per macro tile in the C tensor.
* @note It is assumed that `iters_per_sk_cta` > 0.
*/
template <ck_tile::StreamKReductionStrategy ReductionStrategy>
ck_tile::index_t
estimate_num_wgs_per_tile(index_t sk_ctas, index_t iters_per_sk_cta, index_t iters_per_tile)
{
// In the case of non-atomic reduction or data-parallel only, there will always be 1 workgroup
// writing final results to a given macro tile in C.
int num_wgs_per_tile = 1;
// Otherwise, for atomics, multiple workgroups may be writing to the same macro tile in C.
if(sk_ctas > 0 && ReductionStrategy == ck_tile::StreamKReductionStrategy::Atomic)
{
// Estimate the number of workgroups per macro tile.
num_wgs_per_tile =
(iters_per_tile / iters_per_sk_cta) + ((iters_per_tile % iters_per_sk_cta) != 0);
}
return std::max(num_wgs_per_tile, 1);
}
} // namespace ck_tile

View File

@@ -33,9 +33,10 @@
#include "ck_tile/ops/gemm/kernel/gemm_multi_abd_kernel.hpp"
#include "ck_tile/ops/gemm/kernel/gemm_multi_d_kernel.hpp"
#include "ck_tile/ops/gemm/kernel/gemm_tile_partitioner.hpp"
#include "ck_tile/ops/gemm/kernel/streamk_gemm_tile_partitioner.hpp"
#include "ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp"
#include "ck_tile/ops/gemm/kernel/streamk_gemm_kernel.hpp"
#include "ck_tile/ops/gemm/kernel/streamk_gemm_tile_partitioner.hpp"
#include "ck_tile/ops/gemm/kernel/streamk_gemm_tile_partitioner_impl.hpp"
#include "ck_tile/ops/gemm/kernel/universal_gemm_kernel.hpp"
#include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_base.hpp"
#include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_async.hpp"

View File

@@ -86,8 +86,12 @@ class submodule_t:
submodule = submodule_t()
# formatting
for x in all_files:
subprocess.Popen(f"dos2unix -n {str(x)}", shell=True)
cmd = f"clang-format-18 -style=file -i {str(x)}"
subprocess.Popen(
f"python -m dos2unix {str(x)} {str(x)}",
shell=True,
stdout=open(os.devnull, "wb"),
)
cmd = f"clang-format -style=file -i {str(x)}"
# for xp in x.parents:
# print(get_file_base(x))
subprocess.Popen(cmd, shell=True)