mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-04-20 06:49:15 +00:00
[CK] s_prefetch unit test fixes.
Signed-off-by: Michal Kulikowski <Michal.Kulikowski@amd.com>
This commit is contained in:
committed by
Michał Kulikowski
parent
f3ef7acca0
commit
cd8af997e6
@@ -1,54 +1,44 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <algorithm>
|
||||
#include <cstdlib>
|
||||
#include <iostream>
|
||||
#include <numeric>
|
||||
#include <tuple>
|
||||
#include <vector>
|
||||
#include <chrono>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/library/utility/device_memory.hpp"
|
||||
#include "ck/library/utility/host_tensor.hpp"
|
||||
#include "ck/library/utility/check_err.hpp"
|
||||
#include "ck/host_utility/hip_check_error.hpp"
|
||||
#include "ck/host_utility/device_prop.hpp"
|
||||
|
||||
#include <hip/hip_runtime.h>
|
||||
|
||||
#if __clang_major__ >= 20
|
||||
#include "ck/utility/amd_buffer_addressing_builtins.hpp"
|
||||
#else
|
||||
#include "ck/utility/amd_buffer_addressing.hpp"
|
||||
#endif
|
||||
|
||||
#include "s_prefetch_op_util.hpp"
|
||||
|
||||
template <typename T>
|
||||
bool run_test()
|
||||
template <typename T, uint32_t NUM_THREADS, uint32_t NUM_SCALARS>
|
||||
bool run_test(bool time_kernels)
|
||||
{
|
||||
bool pass = true;
|
||||
|
||||
const auto s_prefetch_kernel = ck::s_prefetch_op_util::kernel_with_scalar_prefetch<T>;
|
||||
const auto s_buffer_prefetch_kernel =
|
||||
ck::s_prefetch_op_util::kernel_with_scalar_buffer_prefetch<T>;
|
||||
const auto s_prefetch_kernel =
|
||||
ck::s_prefetch_op_util::kernel_with_prefetch<T,
|
||||
NUM_THREADS,
|
||||
NUM_SCALARS,
|
||||
ck::s_prefetch_op_util::SPrefetchDataOp<T>>;
|
||||
const auto s_buffer_prefetch_kernel = ck::s_prefetch_op_util::kernel_with_prefetch<
|
||||
T,
|
||||
NUM_THREADS,
|
||||
NUM_SCALARS,
|
||||
ck::s_prefetch_op_util::SBufferPrefetchDataOp<T, NUM_SCALARS>>;
|
||||
|
||||
const auto prefetch_kernel_container =
|
||||
std::make_tuple(s_prefetch_kernel, s_buffer_prefetch_kernel);
|
||||
|
||||
ck::static_for<0, 2, 1>{}([&](auto i) {
|
||||
std::string kernel_name = (i == 1 ? "s_buffer_prefetch" : "s_prefetch");
|
||||
pass &= ck::s_prefetch_op_util::test_constant_prefetch_impl<
|
||||
decltype(std::get<ck::Number<i>{}>(prefetch_kernel_container)),
|
||||
T>(std::get<ck::Number<i>{}>(prefetch_kernel_container), kernel_name);
|
||||
|
||||
auto kernel = std::get<ck::Number<i>{}>(prefetch_kernel_container);
|
||||
|
||||
pass &= ck::s_prefetch_op_util::
|
||||
test_prefetch_impl<decltype(kernel), T, NUM_THREADS, NUM_SCALARS>(
|
||||
time_kernels, kernel, kernel_name);
|
||||
});
|
||||
|
||||
return pass;
|
||||
}
|
||||
|
||||
int main(int, char*[])
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
if(!ck::is_gfx12_supported())
|
||||
{
|
||||
@@ -56,13 +46,20 @@ int main(int, char*[])
|
||||
return 0;
|
||||
}
|
||||
|
||||
bool time_kernels = false;
|
||||
|
||||
if(argc == 2)
|
||||
{
|
||||
time_kernels = std::stoi(argv[1]);
|
||||
}
|
||||
|
||||
bool pass = true;
|
||||
|
||||
std::cout << "=== Testing Constant Cache Prefetch ===" << std::endl;
|
||||
|
||||
// Test different data types
|
||||
pass &= run_test<float>();
|
||||
pass &= run_test<double>();
|
||||
pass &= run_test<float, 4096, 1024>(time_kernels);
|
||||
pass &= run_test<double, 4096, 512>(time_kernels);
|
||||
|
||||
std::cout << "TestConstantPrefetch ..... " << (pass ? "SUCCESS" : "FAILURE") << std::endl;
|
||||
return pass ? 0 : 1;
|
||||
|
||||
@@ -1,133 +1,123 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <algorithm>
|
||||
#include <cstdlib>
|
||||
#include <iostream>
|
||||
#include <numeric>
|
||||
#include <tuple>
|
||||
#include <vector>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/library/utility/device_memory.hpp"
|
||||
#include "ck/library/utility/host_tensor.hpp"
|
||||
#include "ck/library/utility/check_err.hpp"
|
||||
#include "ck/host_utility/hip_check_error.hpp"
|
||||
|
||||
#include <hip/hip_runtime.h>
|
||||
|
||||
namespace ck {
|
||||
namespace s_prefetch_op_util {
|
||||
|
||||
// Prefetch to constant cache using AMD builtin with chunks_to_prefetch(1..32: 1 chunk = 128B)
|
||||
template <typename T>
|
||||
__device__ __forceinline__ void prefetch_to_constant_cache(const T* addr,
|
||||
unsigned int chunks_to_prefetch)
|
||||
// Enable scalar prefetch in hardware (required on gfx12 before using s_prefetch)
|
||||
__device__ __forceinline__ void enable_scalar_prefetch()
|
||||
{
|
||||
#if defined(__gfx12__)
|
||||
assert(chunks_to_prefetch > 0 && chunks_to_prefetch <= 32);
|
||||
__builtin_amdgcn_s_prefetch_data(addr, chunks_to_prefetch - 1); // we need to pass 0..31
|
||||
#else
|
||||
// ignore - not supported
|
||||
(void)addr;
|
||||
(void)chunks_to_prefetch;
|
||||
#endif
|
||||
}
|
||||
|
||||
// Prefetch to constant cache using AMD builtin with chunks_to_prefetch(1..32: 1 chunk = 128B)
|
||||
template <unsigned int offset>
|
||||
__device__ __forceinline__ void prefetch_to_constant_cache(__amdgpu_buffer_rsrc_t buf_res,
|
||||
unsigned int chunks_to_prefetch)
|
||||
{
|
||||
#if defined(__gfx12__)
|
||||
assert(chunks_to_prefetch > 0 && chunks_to_prefetch <= 32);
|
||||
__builtin_amdgcn_s_buffer_prefetch_data(buf_res, offset, chunks_to_prefetch - 1);
|
||||
#else
|
||||
// ignore - not supported
|
||||
(void)buf_res;
|
||||
(void)chunks_to_prefetch;
|
||||
// SCALAR_PREFETCH_EN is bit 24 in MODE register (hwreg 1)
|
||||
// Set 1 bit at offset 24 to value 1
|
||||
__builtin_amdgcn_s_setreg(1 | (24 << 6), 1); // Set bit to 1
|
||||
#endif
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__global__ void kernel_with_scalar_prefetch(const T* src,
|
||||
T* dst,
|
||||
const void CK_CONSTANT_ADDRESS_SPACE* scalar_data,
|
||||
index_t num_elements,
|
||||
index_t num_scalars)
|
||||
struct SPrefetchDataOp
|
||||
{
|
||||
index_t tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
// Prefetch to constant cache using AMD builtin with cachelines to prefetch(1..32)
|
||||
__device__ __forceinline__ void operator()(const T CK_CONSTANT_ADDRESS_SPACE* addr,
|
||||
unsigned int num_cachelines) const
|
||||
{
|
||||
#if defined(__gfx12__)
|
||||
assert(num_cachelines > 0 && num_cachelines <= 32);
|
||||
__builtin_amdgcn_s_prefetch_data(addr, num_cachelines - 1); // we need to pass 0..31
|
||||
#else
|
||||
// ignore - not supported
|
||||
(void)addr;
|
||||
(void)num_cachelines;
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
const T CK_CONSTANT_ADDRESS_SPACE* scalar_elems =
|
||||
static_cast<const T CK_CONSTANT_ADDRESS_SPACE*>(scalar_data);
|
||||
template <typename T, uint32_t NUM_SCALARS>
|
||||
struct SBufferPrefetchDataOp
|
||||
{
|
||||
// Prefetch to constant cache using AMD builtin with cachelines to prefetch(1..32)
|
||||
__device__ __forceinline__ void operator()(const T CK_CONSTANT_ADDRESS_SPACE* addr,
|
||||
unsigned int num_cachelines) const
|
||||
{
|
||||
#if defined(__gfx12__)
|
||||
__amdgpu_buffer_rsrc_t buf_res = make_wave_buffer_resource_new(addr, NUM_SCALARS);
|
||||
assert(num_cachelines > 0 && num_cachelines <= 32);
|
||||
__builtin_amdgcn_s_buffer_prefetch_data(buf_res, 0, num_cachelines - 1);
|
||||
#else
|
||||
// ignore - not supported
|
||||
(void)addr;
|
||||
(void)num_cachelines;
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
// Calculate number of 128B chunks needed to cover num_scalars elements
|
||||
constexpr index_t chunk_size_bytes = 128;
|
||||
constexpr index_t elements_per_chunk = chunk_size_bytes / sizeof(T);
|
||||
unsigned int chunks_needed = (num_scalars + elements_per_chunk - 1) / elements_per_chunk;
|
||||
template <typename T, uint32_t NUM_THREADS, uint32_t NUM_SCALARS, typename PrefetchOp>
|
||||
__global__ void kernel_with_prefetch(const T* src,
|
||||
T* dst,
|
||||
const T CK_CONSTANT_ADDRESS_SPACE* scalar_data,
|
||||
bool enable_prefetch)
|
||||
{
|
||||
uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
|
||||
// Prefetch all scalar data at once using chunks parameter
|
||||
// Calculate number of 128B cachelines needed to cover num_scalars elements
|
||||
constexpr index_t cachelineSize = 128;
|
||||
constexpr index_t elements_per_cachelineSize = cachelineSize / sizeof(T);
|
||||
constexpr unsigned int cachelinesNeeded =
|
||||
(NUM_SCALARS + elements_per_cachelineSize - 1) / elements_per_cachelineSize;
|
||||
|
||||
// Prefetch all scalar data at once
|
||||
if(threadIdx.x == 0)
|
||||
{
|
||||
prefetch_to_constant_cache(scalar_elems, chunks_needed);
|
||||
if(enable_prefetch)
|
||||
{
|
||||
enable_scalar_prefetch();
|
||||
}
|
||||
PrefetchOp{}(scalar_data, cachelinesNeeded);
|
||||
}
|
||||
|
||||
T sum = 0;
|
||||
if(tid < num_elements)
|
||||
if(tid < NUM_THREADS)
|
||||
{
|
||||
sum = src[tid]; // load from global mem to make sure prefetch finished
|
||||
sum = src[tid]; // load from global mem to give time for prefetch to finish or be close to
|
||||
// finishs
|
||||
}
|
||||
__syncthreads(); // waits on loads from global mem
|
||||
if(tid < num_elements)
|
||||
if(tid < NUM_THREADS)
|
||||
{
|
||||
// Access prefetched scalar data
|
||||
for(index_t i = 0; i < num_scalars; i++)
|
||||
for(uint32_t i = 0; i < NUM_SCALARS; i++)
|
||||
{
|
||||
sum += scalar_elems[i]; // should be fast due to scalars being preloaded
|
||||
sum += scalar_data[i]; // should be fast due to scalars being preloaded
|
||||
}
|
||||
|
||||
dst[tid] = sum;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__global__ void
|
||||
kernel_with_scalar_buffer_prefetch(const T* src,
|
||||
T* dst,
|
||||
const void CK_CONSTANT_ADDRESS_SPACE* scalar_data,
|
||||
index_t num_elements,
|
||||
index_t num_scalars)
|
||||
template <typename PrefetchKernel, typename T, uint32_t NUM_THREADS, uint32_t NUM_SCALARS>
|
||||
bool test_prefetch_impl(bool time_kernels,
|
||||
const PrefetchKernel& prefetch_kernel,
|
||||
const std::string& kernel_name)
|
||||
{
|
||||
index_t tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
|
||||
const T CK_CONSTANT_ADDRESS_SPACE* scalar_elems =
|
||||
static_cast<const T CK_CONSTANT_ADDRESS_SPACE*>(scalar_data);
|
||||
|
||||
// Calculate number of 128B chunks needed to cover num_scalars elements
|
||||
constexpr index_t chunk_size_bytes = 128;
|
||||
constexpr index_t elements_per_chunk = chunk_size_bytes / sizeof(T);
|
||||
unsigned int chunks_needed = (num_scalars + elements_per_chunk - 1) / elements_per_chunk;
|
||||
|
||||
__amdgpu_buffer_rsrc_t src_wave_buffer_resource =
|
||||
make_wave_buffer_resource_new(scalar_elems, num_scalars);
|
||||
|
||||
// Prefetch all scalar data at once using chunks parameter
|
||||
if(threadIdx.x == 0)
|
||||
{
|
||||
prefetch_to_constant_cache<0>(src_wave_buffer_resource, chunks_needed);
|
||||
}
|
||||
|
||||
T sum = 0;
|
||||
if(tid < num_elements)
|
||||
{
|
||||
sum = src[tid]; // load from global mem to make sure prefetch finished
|
||||
}
|
||||
__syncthreads(); // waits on loads from global mem
|
||||
if(tid < num_elements)
|
||||
{
|
||||
// Access prefetched scalar data
|
||||
for(index_t i = 0; i < num_scalars; i++)
|
||||
{
|
||||
sum += amd_s_buffer_load_impl<T, 1>(
|
||||
src_wave_buffer_resource,
|
||||
i * sizeof(T)); // should be fast due to scalars being preloaded
|
||||
}
|
||||
|
||||
dst[tid] = sum;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename PrefetchKernel, typename T>
|
||||
bool test_constant_prefetch_impl(const PrefetchKernel& prefetch_kernel,
|
||||
const std::string& kernel_name)
|
||||
{
|
||||
constexpr index_t num_elements = 512;
|
||||
constexpr index_t num_scalars = 512;
|
||||
// TODO: maybe add more prefetch instructions inside kernel to support more values
|
||||
assert(NUM_SCALARS / sizeof(T) < (128 * 32));
|
||||
constexpr index_t num_elements = NUM_THREADS;
|
||||
constexpr index_t num_scalars = NUM_SCALARS;
|
||||
constexpr index_t block_size = 256;
|
||||
constexpr index_t grid_size = (num_elements + block_size - 1) / block_size;
|
||||
|
||||
@@ -171,14 +161,75 @@ bool test_constant_prefetch_impl(const PrefetchKernel& prefetch_kernel,
|
||||
hipStream_t stream;
|
||||
hip_check_error(hipStreamCreate(&stream));
|
||||
|
||||
prefetch_kernel<<<grid_size, block_size, 0, stream>>>(
|
||||
static_cast<const T*>(d_src.GetDeviceBuffer()),
|
||||
static_cast<T*>(d_dst_with_prefetch_chunks.GetDeviceBuffer()),
|
||||
cast_pointer_to_constant_address_space(d_scalar.GetDeviceBuffer()),
|
||||
num_elements,
|
||||
num_scalars);
|
||||
if(time_kernels)
|
||||
{
|
||||
ck::static_for<0, 2, 1>{}([&](auto static_i) {
|
||||
constexpr bool prefetch_enabled = static_i == 0;
|
||||
std::cout << "PREFETCH " << (prefetch_enabled ? "ENABLED!" : "DISABLED!") << std::endl;
|
||||
|
||||
hip_check_error(hipStreamSynchronize(stream));
|
||||
constexpr int num_warmup = 1;
|
||||
constexpr int num_iterations = 10;
|
||||
|
||||
// Warmup runs
|
||||
for(int i = 0; i < num_warmup; i++)
|
||||
{
|
||||
prefetch_kernel<<<grid_size, block_size, 0, stream>>>(
|
||||
static_cast<const T*>(d_src.GetDeviceBuffer()),
|
||||
static_cast<T*>(d_dst_with_prefetch_chunks.GetDeviceBuffer()),
|
||||
cast_pointer_to_constant_address_space(
|
||||
static_cast<const T*>(d_scalar.GetDeviceBuffer())),
|
||||
prefetch_enabled);
|
||||
}
|
||||
hip_check_error(hipStreamSynchronize(stream));
|
||||
|
||||
// Performance measurement
|
||||
hipEvent_t start, stop;
|
||||
hip_check_error(hipEventCreate(&start));
|
||||
hip_check_error(hipEventCreate(&stop));
|
||||
|
||||
hip_check_error(hipEventRecord(start, stream));
|
||||
for(int i = 0; i < num_iterations; i++)
|
||||
{
|
||||
prefetch_kernel<<<grid_size, block_size, 0, stream>>>(
|
||||
static_cast<const T*>(d_src.GetDeviceBuffer()),
|
||||
static_cast<T*>(d_dst_with_prefetch_chunks.GetDeviceBuffer()),
|
||||
cast_pointer_to_constant_address_space(
|
||||
static_cast<const T*>(d_scalar.GetDeviceBuffer())),
|
||||
prefetch_enabled);
|
||||
}
|
||||
hip_check_error(hipEventRecord(stop, stream));
|
||||
|
||||
hip_check_error(hipStreamSynchronize(stream));
|
||||
|
||||
float elapsed_ms = 0;
|
||||
hip_check_error(hipEventElapsedTime(&elapsed_ms, start, stop));
|
||||
|
||||
float avg_time_us = (elapsed_ms * 1000.0f) / num_iterations;
|
||||
float total_bytes = (num_elements * sizeof(T) + num_scalars * sizeof(T)); // read
|
||||
float bandwidth_gb_s = (total_bytes / (avg_time_us * 1e-6)) / 1e9;
|
||||
float ops_per_iteration = num_elements * num_scalars; // adds
|
||||
float gflops = (ops_per_iteration / (avg_time_us * 1e-6)) / 1e9;
|
||||
|
||||
std::cout << " Performance: " << std::endl;
|
||||
std::cout << " Average kernel time: " << avg_time_us << " us" << std::endl;
|
||||
std::cout << " Effective bandwidth: " << bandwidth_gb_s << " GB/s" << std::endl;
|
||||
std::cout << " Compute throughput: " << gflops << " GFLOPS" << std::endl;
|
||||
|
||||
hip_check_error(hipEventDestroy(start));
|
||||
hip_check_error(hipEventDestroy(stop));
|
||||
});
|
||||
}
|
||||
else
|
||||
{
|
||||
prefetch_kernel<<<grid_size, block_size, 0, stream>>>(
|
||||
static_cast<const T*>(d_src.GetDeviceBuffer()),
|
||||
static_cast<T*>(d_dst_with_prefetch_chunks.GetDeviceBuffer()),
|
||||
cast_pointer_to_constant_address_space(
|
||||
static_cast<const T*>(d_scalar.GetDeviceBuffer())),
|
||||
true);
|
||||
|
||||
hip_check_error(hipStreamSynchronize(stream));
|
||||
}
|
||||
|
||||
// Copy results back
|
||||
d_dst_with_prefetch_chunks.FromDevice(h_dst_with_prefetch_chunks.data());
|
||||
@@ -186,7 +237,8 @@ bool test_constant_prefetch_impl(const PrefetchKernel& prefetch_kernel,
|
||||
// Verify results
|
||||
bool pass = ck::utils::check_err(h_dst_with_prefetch_chunks, h_expected);
|
||||
|
||||
std::cout << (pass ? "PASS" : "FAIL") << std::endl;
|
||||
std::cout << " Correctness: " << (pass ? "PASS" : "FAIL") << std::endl;
|
||||
std::cout << std::endl;
|
||||
|
||||
hip_check_error(hipStreamDestroy(stream));
|
||||
|
||||
|
||||
Reference in New Issue
Block a user