Mirror of https://github.com/ROCm/composable_kernel.git, synced 2026-05-11 17:00:18 +00:00

Improve the performance of qr_ks_vs_whole_k_prefetch pipeline (#6209)

## About qr_ks_vs_whole_k_prefetch pipeline

This PR updates and enhances the qr_ks_vs_whole_k_prefetch pipeline to improve performance on both MI300 and MI350 GPUs through better MFMA instruction usage, transposed V-loading support, and an N0-loop implementation. The pipeline targets scenarios where the number of workgroups is low, enabling better CU occupancy by using smaller MTile sizes (kM0=64 vs 128) while prefetching entire K tiles.

## Changes

- Adds transposed V-loading support (qr_ks_vs_whole_k_prefetch_trload) to avoid using shuffle instructions on MI350
- Implements N0-loop based Gemm0 to reduce tile-window movement overhead and eliminate `clear_tile` calls
- Adds full support for hdim96/hdim160 without padding requirements
- Updates MFMA instruction selection to ensure optimal choices for MI350

## Performance results

1. For attention shapes which lead to kM0=64, `qr_ks_vs_async_whole_k_prefetch_trload` shows much better performance than `qr_ks_vs_async_trload` on the same case (execution time `41.02ms` for whole_k_prefetch_trload vs `58.50ms` for async_load), and it is also clearly faster than the recently tuned `qr_ks_vs_async` on the same case (`41.02ms` for whole_k_prefetch_trload vs `47.60ms` for qr_ks_vs_async).
2. Also, on MI300, for attention shapes which lead to kM0=64, `qr_ks_vs_async_whole_k_prefetch` shows much better performance than `qr_ks_vs_async` (which is supposed to be highly efficient) on the same case (`64.50ms` for whole_k_prefetch vs `80.20ms` for qr_ks_vs_async).
3. For attention shapes which lead to kM0=128, `qr_ks_vs_async_whole_k_prefetch_trload` shows slightly better performance than `qr_ks_vs_async` on MI350 (`104.50ms` for whole_k_prefetch_trload vs `106.50ms` for qr_ks_vs_async), and the two show on-par performance on MI300.

## Test/Verify

1. Use the ROCm xformers branch `test_whole_k_prefetch_n0loop` to test/verify the qr_ks_vs_whole_k_prefetch pipeline, since this pipeline cannot be used from the ck_tile fmha example so far.
2. Use the following commands to build and test xformers:

   ```bash
   git clone -b test_whole_k_prefetch_n0loop https://github.com/ROCm/xformers
   git submodule update --init --recursive
   pip install --no-build-isolation -e ./
   pytest tests/test_mem_eff_attention.py::test_forward
   ```

3. Any script that runs on xformers can be used to evaluate the qr_ks_vs_whole_k_prefetch pipeline. Use the following two environment variables to switch between pipelines:

   ```bash
   export FMHA_DISABLE_SPECIAL_TREATMENT=1 # disable using the FAV3 and qr_ks_vs_async_trload pipelines
   export FMHA_ENABLE_ASYNC_PIPELINE=1     # disable using the qr_ks_vs_async pipeline, for comparison
   ```

## Discussion

// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT

#pragma once

#include "ck_tile/core/config.hpp"
#include "ck_tile/host/hip_check_error.hpp"

#include <hip/hip_runtime.h>

#include <algorithm> // std::min
#include <cstdint>   // uint64_t, int32_t
#include <iostream>
#include <vector>    // std::vector members below

namespace ck_tile {

// RotatingMemWrapper: Prevents GPU data cache reuse during kernel benchmarking.
//
// Purpose:
// When benchmarking a kernel repeatedly with the same input buffers, the GPU L2 cache
// will serve data from cache (hot) instead of HBM (cold), leading to artificially fast
// timing measurements. This wrapper rotates through multiple copies of buffers at different
// memory addresses to force cache misses.
//
// How it works:
// Constructor: Creates rotating_count copies of matrices A and B in GPU memory
// Next(): Switches pointers to the next buffer copy (cycles through all copies)
// Destructor: Frees extra buffer copies and restores original pointers
//
// Combined with flush_icache(), this ensures realistic "cold cache" performance measurements.
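//
// Typical benchmarking loop (an illustrative sketch, not code from this file;
// `launch_kernel`, `p_a`, `p_b`, `bytes_a`, `bytes_b`, and `n_repeat` are
// placeholder names, and how the harness passes the rotated pointers to the
// kernel is left out):
//
//   RotatingMemWrapper<ADataType, BDataType> rotating_mem(
//       p_a, p_b, /*rotating_count_hint=*/8, bytes_a, bytes_b);
//   for(int i = 0; i < n_repeat; ++i)
//   {
//       flush_icache();      // cold instruction cache (see flush_icache below)
//       rotating_mem.Next(); // point A/B at a copy living at a fresh address
//       launch_kernel(...);  // the timed run now reads cold data from HBM
//   }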
template <typename ADataType, typename BDataType>
struct RotatingMemWrapper
{
    RotatingMemWrapper() = delete;
    RotatingMemWrapper(const void* a_ptr_,
                       const void* b_ptr_,
                       std::size_t rotating_count_hint,
                       std::size_t size_a_,
                       std::size_t size_b_)
        : a_ptr(a_ptr_),
          b_ptr(b_ptr_),
          rotating_count(rotating_count_hint),
          size_a(size_a_),
          size_b(size_b_)
    {
        // Store original buffer pointers as first entry
        p_a_grids.push_back(a_ptr);
        p_b_grids.push_back(b_ptr);

        // Limit the rotating count to prevent out-of-memory
        const uint64_t footprint = (size_a + size_b);
        const uint64_t max_rotating_count = (1ULL << 31) / footprint;
        rotating_count = std::min(rotating_count, max_rotating_count);
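        // e.g. a footprint of 64 MiB allows at most 2^31 / 2^26 = 32 copies,
        // keeping the total rotating footprint at or below ~2 GiB.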

        // Create (rotating_count - 1) additional copies at different memory addresses
        for(size_t i = 1; i < rotating_count; i++)
        {
            {
                void* pADeviceBuf;
                HIP_CHECK_ERROR(hipMalloc(static_cast<void**>(&pADeviceBuf), size_a_));
                HIP_CHECK_ERROR(hipMemcpy(static_cast<void*>(pADeviceBuf), // target buffer
                                          const_cast<void*>(p_a_grids[0]), // source buffer
                                          size_a_,
                                          hipMemcpyDeviceToDevice));
                p_a_grids.push_back(pADeviceBuf);
            }

            {
                void* pBDeviceBuf;
                HIP_CHECK_ERROR(hipMalloc(static_cast<void**>(&pBDeviceBuf), size_b_));
                HIP_CHECK_ERROR(hipMemcpy(static_cast<void*>(pBDeviceBuf), // target buffer
                                          const_cast<void*>(p_b_grids[0]), // source buffer
                                          size_b_,
                                          hipMemcpyDeviceToDevice));
                p_b_grids.push_back(pBDeviceBuf);
            }
        }
    }

    // Rotate to the next buffer copy. Call this before each kernel run to use different
    // memory addresses, forcing the GPU to fetch data from HBM instead of cache.
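    // e.g. with rotating_count = 3, successive calls use copies 0, 1, 2, 0, 1, 2, ...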
    void Next()
    {
        if(rotating_count > 1)
        {
            std::size_t idx = iter++ % rotating_count; // Cycle through all buffer copies
            a_ptr = p_a_grids[idx];
            b_ptr = p_b_grids[idx];
        }
    }

    void Print()
    {
        std::cout << "RotatingMemWrapper: { size_a: " << size_a << ", size_b: " << size_b
                  << ", rotating_count: " << rotating_count << "}" << std::endl;
    }

    // Cleanup: Free all extra buffer copies (keeping original) and restore original pointers
    ~RotatingMemWrapper() noexcept
    {
        if(rotating_count > 1)
        {
            // Restore original buffer pointers
            a_ptr = p_a_grids[0];
            b_ptr = p_b_grids[0];

            // Free extra buffer copies (index 0 is the original, don't free it)
            for(size_t i = 1; i < rotating_count; i++)
            {
                ck_tile::hip_check_error(hipFree(const_cast<void*>(p_a_grids[i])));
                ck_tile::hip_check_error(hipFree(const_cast<void*>(p_b_grids[i])));
            }
        }
    }

    private:
    const void* a_ptr;
    const void* b_ptr;
    std::size_t iter = 0;
    std::size_t rotating_count = 1;
    std::size_t size_a = 0;
    std::size_t size_b = 0;
    std::vector<const void*> p_a_grids;
    std::vector<const void*> p_b_grids;
};

inline void flush_icache()
{
    hipDeviceProp_t deviceProps;
    HIP_CHECK_ERROR(hipGetDeviceProperties(&deviceProps, 0));

    // Over-provision blocks to ensure all CUs execute the flush instruction.
    // With imperfect scheduling, launching exactly 1 block per CU doesn't guarantee coverage.
    // 60x over-provisioning provides statistical certainty that every CU gets at least one block.
    constexpr int32_t blocks_per_cu = 60;
    int32_t gpu_block3 = deviceProps.multiProcessorCount * blocks_per_cu;
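    // e.g. on a 304-CU GPU (MI300X-class) this launches 304 * 60 = 18240 blocks
    // of 64 threads each.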

    ck_tile::flush_cache<<<dim3(gpu_block3), dim3(64), 0, nullptr>>>();
    HIP_CHECK_ERROR(hipGetLastError());
}

} // namespace ck_tile