diff --git a/include/ck/utility/amd_buffer_addressing.hpp b/include/ck/utility/amd_buffer_addressing.hpp
index e626603949..783fc661ce 100644
--- a/include/ck/utility/amd_buffer_addressing.hpp
+++ b/include/ck/utility/amd_buffer_addressing.hpp
@@ -3,7 +3,6 @@
 #pragma once
 
 #include "data_type.hpp"
-#include "amd_inline_asm.hpp"
 
 namespace ck {
 
@@ -1065,48 +1064,4 @@ __device__ void amd_direct_load_global_to_lds(const T* global_base_ptr,
 }
 #endif
 
-template <index_t N>
-__device__ typename vector_type<int8_t, N>::type
-amd_s_buffer_load_impl_raw(__amdgpu_buffer_rsrc_t src_wave_buffer_resource,
-                           index_t src_wave_addr_offset)
-{
-    static_assert(N == 4 || N == 8, "wrong! not implemented");
-    // TODO: add other variants of s_buffer_load
-    if constexpr(N == 4)
-    {
-        int32_t tmp =
-            amd_assembly_s_buffer_load_b32(src_wave_buffer_resource, src_wave_addr_offset);
-        return bit_cast<int8x4_t>(tmp);
-    }
-    else if constexpr(N == 8)
-    {
-        int32x2_t tmp =
-            amd_assembly_s_buffer_load_b64(src_wave_buffer_resource, src_wave_addr_offset);
-        return bit_cast<int8x8_t>(tmp);
-    }
-}
-
-template <typename T, index_t N>
-__device__ typename vector_type<T, N>::type
-amd_s_buffer_load_impl(__amdgpu_buffer_rsrc_t src_wave_buffer_resource,
-                       index_t src_wave_addr_offset)
-{
-    static_assert((is_same<T, double>::value && (N == 1)) ||
-                      (is_same<T, float>::value && (N == 1 || N == 2)) ||
-                      (is_same<T, half_t>::value && (N == 2 || N == 4)) ||
-                      (is_same<T, bhalf_t>::value && (N == 2 || N == 4)) ||
-                      (is_same<T, int32_t>::value && (N == 1 || N == 2)) ||
-                      (is_same<T, f8_t>::value && (N == 4 || N == 8)) ||
-                      (is_same<T, bf8_t>::value && (N == 4 || N == 8)) ||
-                      (is_same<T, int8_t>::value && (N == 4 || N == 8)) ||
-                      (is_same<T, uint8_t>::value && (N == 4 || N == 8)) ||
-                      (is_same<T, pk_i4_t>::value && (N == 4 || N == 8)),
-                  "wrong! not implemented");
-
-    using r_t = typename vector_type<T, N>::type;
-
-    auto raw_data =
-        amd_s_buffer_load_impl_raw<sizeof(T) * N>(src_wave_buffer_resource, src_wave_addr_offset);
-    return bit_cast<r_t>(raw_data);
-}
-
 } // namespace ck
diff --git a/include/ck/utility/amd_buffer_addressing_builtins.hpp b/include/ck/utility/amd_buffer_addressing_builtins.hpp
index 06a4ec199d..f642e06050 100644
--- a/include/ck/utility/amd_buffer_addressing_builtins.hpp
+++ b/include/ck/utility/amd_buffer_addressing_builtins.hpp
@@ -3,7 +3,6 @@
 #pragma once
 
 #include "data_type.hpp"
-#include "amd_inline_asm.hpp"
 
 namespace ck {
 
@@ -886,48 +885,4 @@ __device__ void amd_direct_load_global_to_lds(const T* global_base_ptr,
 }
 #endif
 
-template <index_t N>
-__device__ typename vector_type<int8_t, N>::type
-amd_s_buffer_load_impl_raw(__amdgpu_buffer_rsrc_t src_wave_buffer_resource,
-                           index_t src_wave_addr_offset)
-{
-    static_assert(N == 4 || N == 8, "wrong! not implemented");
-    // TODO: add other variants of s_buffer_load
-    if constexpr(N == 4)
-    {
-        int32_t tmp =
-            amd_assembly_s_buffer_load_b32(src_wave_buffer_resource, src_wave_addr_offset);
-        return bit_cast<int8x4_t>(tmp);
-    }
-    else if constexpr(N == 8)
-    {
-        int32x2_t tmp =
-            amd_assembly_s_buffer_load_b64(src_wave_buffer_resource, src_wave_addr_offset);
-        return bit_cast<int8x8_t>(tmp);
-    }
-}
-
-template <typename T, index_t N>
-__device__ typename vector_type<T, N>::type
-amd_s_buffer_load_impl(__amdgpu_buffer_rsrc_t src_wave_buffer_resource,
-                       index_t src_wave_addr_offset)
-{
-    static_assert((is_same<T, double>::value && (N == 1)) ||
-                      (is_same<T, float>::value && (N == 1 || N == 2)) ||
-                      (is_same<T, half_t>::value && (N == 2 || N == 4)) ||
-                      (is_same<T, bhalf_t>::value && (N == 2 || N == 4)) ||
-                      (is_same<T, int32_t>::value && (N == 1 || N == 2)) ||
-                      (is_same<T, f8_t>::value && (N == 4 || N == 8)) ||
-                      (is_same<T, bf8_t>::value && (N == 4 || N == 8)) ||
-                      (is_same<T, int8_t>::value && (N == 4 || N == 8)) ||
-                      (is_same<T, uint8_t>::value && (N == 4 || N == 8)) ||
-                      (is_same<T, pk_i4_t>::value && (N == 4 || N == 8)),
-                  "wrong! not implemented");
-
-    using r_t = typename vector_type<T, N>::type;
-
-    auto raw_data =
-        amd_s_buffer_load_impl_raw<sizeof(T) * N>(src_wave_buffer_resource, src_wave_addr_offset);
-    return bit_cast<r_t>(raw_data);
-}
-
 } // namespace ck
diff --git a/include/ck/utility/amd_inline_asm.hpp b/include/ck/utility/amd_inline_asm.hpp
index e9f9e407d6..79efd77edb 100644
--- a/include/ck/utility/amd_inline_asm.hpp
+++ b/include/ck/utility/amd_inline_asm.hpp
@@ -432,28 +432,5 @@ __device__ void amd_assembly_outer_product_1x4(int8x16_t a,
 }
 #endif
 
-// s_buffer_loads
-inline __device__ int32_t
-amd_assembly_s_buffer_load_b32(__amdgpu_buffer_rsrc_t src_wave_buffer_resource, unsigned int offset)
-{
-    int32_t result;
-    asm volatile("s_buffer_load_b32 %0, %1, %2"
-                 : "=s"(result)
-                 : "s"(src_wave_buffer_resource), "s"(offset)
-                 : "memory");
-    return result;
-}
-
-inline __device__ int32x2_t
-amd_assembly_s_buffer_load_b64(__amdgpu_buffer_rsrc_t src_wave_buffer_resource, unsigned int offset)
-{
-    int32x2_t result;
-    asm volatile("s_buffer_load_b64 %0, %1, %2"
-                 : "=s"(result)
-                 : "s"(src_wave_buffer_resource), "s"(offset)
-                 : "memory");
-    return result;
-}
-
 } // namespace ck
 #endif
"s_buffer_prefetch" : "s_prefetch"); - pass &= ck::s_prefetch_op_util::test_constant_prefetch_impl< - decltype(std::get{}>(prefetch_kernel_container)), - T>(std::get{}>(prefetch_kernel_container), kernel_name); + + auto kernel = std::get{}>(prefetch_kernel_container); + + pass &= ck::s_prefetch_op_util:: + test_prefetch_impl( + time_kernels, kernel, kernel_name); }); return pass; } -int main(int, char*[]) +int main(int argc, char* argv[]) { if(!ck::is_gfx12_supported()) { @@ -56,13 +46,20 @@ int main(int, char*[]) return 0; } + bool time_kernels = false; + + if(argc == 2) + { + time_kernels = std::stoi(argv[1]); + } + bool pass = true; std::cout << "=== Testing Constant Cache Prefetch ===" << std::endl; // Test different data types - pass &= run_test(); - pass &= run_test(); + pass &= run_test(time_kernels); + pass &= run_test(time_kernels); std::cout << "TestConstantPrefetch ..... " << (pass ? "SUCCESS" : "FAILURE") << std::endl; return pass ? 0 : 1; diff --git a/test/s_prefetch_op/s_prefetch_op_util.hpp b/test/s_prefetch_op/s_prefetch_op_util.hpp index e894baf677..077b876b1a 100644 --- a/test/s_prefetch_op/s_prefetch_op_util.hpp +++ b/test/s_prefetch_op/s_prefetch_op_util.hpp @@ -1,133 +1,123 @@ // SPDX-License-Identifier: MIT // Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. +#include +#include +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/check_err.hpp" +#include "ck/host_utility/hip_check_error.hpp" + +#include + namespace ck { namespace s_prefetch_op_util { -// Prefetch to constant cache using AMD builtin with chunks_to_prefetch(1..32: 1 chunk = 128B) -template -__device__ __forceinline__ void prefetch_to_constant_cache(const T* addr, - unsigned int chunks_to_prefetch) +// Enable scalar prefetch in hardware (required on gfx12 before using s_prefetch) +__device__ __forceinline__ void enable_scalar_prefetch() { #if defined(__gfx12__) - assert(chunks_to_prefetch > 0 && chunks_to_prefetch <= 32); - __builtin_amdgcn_s_prefetch_data(addr, chunks_to_prefetch - 1); // we need to pass 0..31 -#else - // ignore - not supported - (void)addr; - (void)chunks_to_prefetch; -#endif -} - -// Prefetch to constant cache using AMD builtin with chunks_to_prefetch(1..32: 1 chunk = 128B) -template -__device__ __forceinline__ void prefetch_to_constant_cache(__amdgpu_buffer_rsrc_t buf_res, - unsigned int chunks_to_prefetch) -{ -#if defined(__gfx12__) - assert(chunks_to_prefetch > 0 && chunks_to_prefetch <= 32); - __builtin_amdgcn_s_buffer_prefetch_data(buf_res, offset, chunks_to_prefetch - 1); -#else - // ignore - not supported - (void)buf_res; - (void)chunks_to_prefetch; + // SCALAR_PREFETCH_EN is bit 24 in MODE register (hwreg 1) + // Set 1 bit at offset 24 to value 1 + __builtin_amdgcn_s_setreg(1 | (24 << 6), 1); // Set bit to 1 #endif } template -__global__ void kernel_with_scalar_prefetch(const T* src, - T* dst, - const void CK_CONSTANT_ADDRESS_SPACE* scalar_data, - index_t num_elements, - index_t num_scalars) +struct SPrefetchDataOp { - index_t tid = blockIdx.x * blockDim.x + threadIdx.x; + // Prefetch to constant cache using AMD builtin with cachelines to prefetch(1..32) + __device__ __forceinline__ void operator()(const T CK_CONSTANT_ADDRESS_SPACE* addr, + unsigned int num_cachelines) const + { +#if defined(__gfx12__) + assert(num_cachelines > 0 && num_cachelines <= 32); + __builtin_amdgcn_s_prefetch_data(addr, num_cachelines - 1); // we 
need to pass 0..31 +#else + // ignore - not supported + (void)addr; + (void)num_cachelines; +#endif + } +}; - const T CK_CONSTANT_ADDRESS_SPACE* scalar_elems = - static_cast(scalar_data); +template +struct SBufferPrefetchDataOp +{ + // Prefetch to constant cache using AMD builtin with cachelines to prefetch(1..32) + __device__ __forceinline__ void operator()(const T CK_CONSTANT_ADDRESS_SPACE* addr, + unsigned int num_cachelines) const + { +#if defined(__gfx12__) + __amdgpu_buffer_rsrc_t buf_res = make_wave_buffer_resource_new(addr, NUM_SCALARS); + assert(num_cachelines > 0 && num_cachelines <= 32); + __builtin_amdgcn_s_buffer_prefetch_data(buf_res, 0, num_cachelines - 1); +#else + // ignore - not supported + (void)addr; + (void)num_cachelines; +#endif + } +}; - // Calculate number of 128B chunks needed to cover num_scalars elements - constexpr index_t chunk_size_bytes = 128; - constexpr index_t elements_per_chunk = chunk_size_bytes / sizeof(T); - unsigned int chunks_needed = (num_scalars + elements_per_chunk - 1) / elements_per_chunk; +template +__global__ void kernel_with_prefetch(const T* src, + T* dst, + const T CK_CONSTANT_ADDRESS_SPACE* scalar_data, + bool enable_prefetch) +{ + uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x; - // Prefetch all scalar data at once using chunks parameter + // Calculate number of 128B cachelines needed to cover num_scalars elements + constexpr index_t cachelineSize = 128; + constexpr index_t elements_per_cachelineSize = cachelineSize / sizeof(T); + constexpr unsigned int cachelinesNeeded = + (NUM_SCALARS + elements_per_cachelineSize - 1) / elements_per_cachelineSize; + + // Prefetch all scalar data at once if(threadIdx.x == 0) { - prefetch_to_constant_cache(scalar_elems, chunks_needed); + if(enable_prefetch) + { + enable_scalar_prefetch(); + } + PrefetchOp{}(scalar_data, cachelinesNeeded); } T sum = 0; - if(tid < num_elements) + if(tid < NUM_THREADS) { - sum = src[tid]; // load from global mem to make sure prefetch finished + sum = src[tid]; // load from global mem to give time for prefetch to finish or be close to + // finishs } __syncthreads(); // waits on loads from global mem - if(tid < num_elements) + if(tid < NUM_THREADS) { // Access prefetched scalar data - for(index_t i = 0; i < num_scalars; i++) + for(uint32_t i = 0; i < NUM_SCALARS; i++) { - sum += scalar_elems[i]; // should be fast due to scalars being preloaded + sum += scalar_data[i]; // should be fast due to scalars being preloaded } dst[tid] = sum; } } -template -__global__ void -kernel_with_scalar_buffer_prefetch(const T* src, - T* dst, - const void CK_CONSTANT_ADDRESS_SPACE* scalar_data, - index_t num_elements, - index_t num_scalars) +template +bool test_prefetch_impl(bool time_kernels, + const PrefetchKernel& prefetch_kernel, + const std::string& kernel_name) { - index_t tid = blockIdx.x * blockDim.x + threadIdx.x; - - const T CK_CONSTANT_ADDRESS_SPACE* scalar_elems = - static_cast(scalar_data); - - // Calculate number of 128B chunks needed to cover num_scalars elements - constexpr index_t chunk_size_bytes = 128; - constexpr index_t elements_per_chunk = chunk_size_bytes / sizeof(T); - unsigned int chunks_needed = (num_scalars + elements_per_chunk - 1) / elements_per_chunk; - - __amdgpu_buffer_rsrc_t src_wave_buffer_resource = - make_wave_buffer_resource_new(scalar_elems, num_scalars); - - // Prefetch all scalar data at once using chunks parameter - if(threadIdx.x == 0) - { - prefetch_to_constant_cache<0>(src_wave_buffer_resource, chunks_needed); - } - - T sum = 0; - if(tid < 
num_elements) - { - sum = src[tid]; // load from global mem to make sure prefetch finished - } - __syncthreads(); // waits on loads from global mem - if(tid < num_elements) - { - // Access prefetched scalar data - for(index_t i = 0; i < num_scalars; i++) - { - sum += amd_s_buffer_load_impl( - src_wave_buffer_resource, - i * sizeof(T)); // should be fast due to scalars being preloaded - } - - dst[tid] = sum; - } -} - -template -bool test_constant_prefetch_impl(const PrefetchKernel& prefetch_kernel, - const std::string& kernel_name) -{ - constexpr index_t num_elements = 512; - constexpr index_t num_scalars = 512; + // TODO: maybe add more prefetch instructions inside kernel to support more values + assert(NUM_SCALARS / sizeof(T) < (128 * 32)); + constexpr index_t num_elements = NUM_THREADS; + constexpr index_t num_scalars = NUM_SCALARS; constexpr index_t block_size = 256; constexpr index_t grid_size = (num_elements + block_size - 1) / block_size; @@ -171,14 +161,75 @@ bool test_constant_prefetch_impl(const PrefetchKernel& prefetch_kernel, hipStream_t stream; hip_check_error(hipStreamCreate(&stream)); - prefetch_kernel<<>>( - static_cast(d_src.GetDeviceBuffer()), - static_cast(d_dst_with_prefetch_chunks.GetDeviceBuffer()), - cast_pointer_to_constant_address_space(d_scalar.GetDeviceBuffer()), - num_elements, - num_scalars); + if(time_kernels) + { + ck::static_for<0, 2, 1>{}([&](auto static_i) { + constexpr bool prefetch_enabled = static_i == 0; + std::cout << "PREFETCH " << (prefetch_enabled ? "ENABLED!" : "DISABLED!") << std::endl; - hip_check_error(hipStreamSynchronize(stream)); + constexpr int num_warmup = 1; + constexpr int num_iterations = 10; + + // Warmup runs + for(int i = 0; i < num_warmup; i++) + { + prefetch_kernel<<>>( + static_cast(d_src.GetDeviceBuffer()), + static_cast(d_dst_with_prefetch_chunks.GetDeviceBuffer()), + cast_pointer_to_constant_address_space( + static_cast(d_scalar.GetDeviceBuffer())), + prefetch_enabled); + } + hip_check_error(hipStreamSynchronize(stream)); + + // Performance measurement + hipEvent_t start, stop; + hip_check_error(hipEventCreate(&start)); + hip_check_error(hipEventCreate(&stop)); + + hip_check_error(hipEventRecord(start, stream)); + for(int i = 0; i < num_iterations; i++) + { + prefetch_kernel<<>>( + static_cast(d_src.GetDeviceBuffer()), + static_cast(d_dst_with_prefetch_chunks.GetDeviceBuffer()), + cast_pointer_to_constant_address_space( + static_cast(d_scalar.GetDeviceBuffer())), + prefetch_enabled); + } + hip_check_error(hipEventRecord(stop, stream)); + + hip_check_error(hipStreamSynchronize(stream)); + + float elapsed_ms = 0; + hip_check_error(hipEventElapsedTime(&elapsed_ms, start, stop)); + + float avg_time_us = (elapsed_ms * 1000.0f) / num_iterations; + float total_bytes = (num_elements * sizeof(T) + num_scalars * sizeof(T)); // read + float bandwidth_gb_s = (total_bytes / (avg_time_us * 1e-6)) / 1e9; + float ops_per_iteration = num_elements * num_scalars; // adds + float gflops = (ops_per_iteration / (avg_time_us * 1e-6)) / 1e9; + + std::cout << " Performance: " << std::endl; + std::cout << " Average kernel time: " << avg_time_us << " us" << std::endl; + std::cout << " Effective bandwidth: " << bandwidth_gb_s << " GB/s" << std::endl; + std::cout << " Compute throughput: " << gflops << " GFLOPS" << std::endl; + + hip_check_error(hipEventDestroy(start)); + hip_check_error(hipEventDestroy(stop)); + }); + } + else + { + prefetch_kernel<<>>( + static_cast(d_src.GetDeviceBuffer()), + static_cast(d_dst_with_prefetch_chunks.GetDeviceBuffer()), + 
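The driver above resolves each kernel out of a tuple at compile time: ck::static_for hands the lambda an integral-constant index, which std::get can consume as a constant expression even though it arrives as an ordinary function argument. A self-contained sketch of that dispatch idiom in plain C++17, with std::integral_constant standing in for ck::Number (the names here are illustrative, not CK APIs):

#include <iostream>
#include <string>
#include <tuple>
#include <utility>

// Minimal stand-in for ck::static_for: invokes f(integral_constant<int, I>{})
// for I = Begin, ..., End - 1.
template <int Begin, int End, typename F>
void static_for(F&& f)
{
    if constexpr(Begin < End)
    {
        f(std::integral_constant<int, Begin>{});
        static_for<Begin + 1, End>(std::forward<F>(f));
    }
}

int main()
{
    // Two different closure types in one tuple, like the two kernel pointers above
    auto kernels = std::make_tuple([] { return std::string{"s_prefetch"}; },
                                   [] { return std::string{"s_buffer_prefetch"}; });

    static_for<0, 2>([&](auto i) {
        // decltype(i)::value is a compile-time constant, so std::get selects the
        // tuple element statically while the loop body reads like runtime code
        auto kernel = std::get<decltype(i)::value>(kernels);
        std::cout << "running " << kernel() << std::endl;
    });
    return 0;
}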
diff --git a/test/s_prefetch_op/s_prefetch_op_util.hpp b/test/s_prefetch_op/s_prefetch_op_util.hpp
index e894baf677..077b876b1a 100644
--- a/test/s_prefetch_op/s_prefetch_op_util.hpp
+++ b/test/s_prefetch_op/s_prefetch_op_util.hpp
@@ -1,133 +1,123 @@
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
 
+#include <cassert>
+#include <cstdlib>
+#include <iostream>
+#include <numeric>
+#include <string>
+#include <vector>
+
+#include "ck/ck.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/check_err.hpp"
+#include "ck/host_utility/hip_check_error.hpp"
+
+#include <hip/hip_runtime.h>
+
 namespace ck {
 namespace s_prefetch_op_util {
 
-// Prefetch to constant cache using AMD builtin with chunks_to_prefetch(1..32: 1 chunk = 128B)
-template <typename T>
-__device__ __forceinline__ void prefetch_to_constant_cache(const T* addr,
-                                                           unsigned int chunks_to_prefetch)
+// Enable scalar prefetch in hardware (required on gfx12 before using s_prefetch)
+__device__ __forceinline__ void enable_scalar_prefetch()
 {
 #if defined(__gfx12__)
-    assert(chunks_to_prefetch > 0 && chunks_to_prefetch <= 32);
-    __builtin_amdgcn_s_prefetch_data(addr, chunks_to_prefetch - 1); // we need to pass 0..31
-#else
-    // ignore - not supported
-    (void)addr;
-    (void)chunks_to_prefetch;
-#endif
-}
-
-// Prefetch to constant cache using AMD builtin with chunks_to_prefetch(1..32: 1 chunk = 128B)
-template <index_t offset>
-__device__ __forceinline__ void prefetch_to_constant_cache(__amdgpu_buffer_rsrc_t buf_res,
-                                                           unsigned int chunks_to_prefetch)
-{
-#if defined(__gfx12__)
-    assert(chunks_to_prefetch > 0 && chunks_to_prefetch <= 32);
-    __builtin_amdgcn_s_buffer_prefetch_data(buf_res, offset, chunks_to_prefetch - 1);
-#else
-    // ignore - not supported
-    (void)buf_res;
-    (void)chunks_to_prefetch;
+    // SCALAR_PREFETCH_EN is bit 24 in the MODE register (hwreg 1):
+    // set the 1-bit field at offset 24 to value 1
+    __builtin_amdgcn_s_setreg(1 | (24 << 6), 1);
 #endif
 }
 
 template <typename T>
-__global__ void kernel_with_scalar_prefetch(const T* src,
-                                            T* dst,
-                                            const void CK_CONSTANT_ADDRESS_SPACE* scalar_data,
-                                            index_t num_elements,
-                                            index_t num_scalars)
+struct SPrefetchDataOp
 {
-    index_t tid = blockIdx.x * blockDim.x + threadIdx.x;
+    // Prefetch to constant cache using the AMD builtin; num_cachelines is 1..32 (128B each)
+    __device__ __forceinline__ void operator()(const T CK_CONSTANT_ADDRESS_SPACE* addr,
+                                               unsigned int num_cachelines) const
+    {
+#if defined(__gfx12__)
+        assert(num_cachelines > 0 && num_cachelines <= 32);
+        __builtin_amdgcn_s_prefetch_data(addr, num_cachelines - 1); // we need to pass 0..31
+#else
+        // ignore - not supported
+        (void)addr;
+        (void)num_cachelines;
+#endif
+    }
+};
 
-    const T CK_CONSTANT_ADDRESS_SPACE* scalar_elems =
-        static_cast<const T CK_CONSTANT_ADDRESS_SPACE*>(scalar_data);
+template <typename T, index_t NUM_SCALARS>
+struct SBufferPrefetchDataOp
+{
+    // Prefetch to constant cache using the AMD builtin; num_cachelines is 1..32 (128B each)
+    __device__ __forceinline__ void operator()(const T CK_CONSTANT_ADDRESS_SPACE* addr,
+                                               unsigned int num_cachelines) const
+    {
+#if defined(__gfx12__)
+        __amdgpu_buffer_rsrc_t buf_res = make_wave_buffer_resource_new(addr, NUM_SCALARS);
+        assert(num_cachelines > 0 && num_cachelines <= 32);
+        __builtin_amdgcn_s_buffer_prefetch_data(buf_res, 0, num_cachelines - 1);
+#else
+        // ignore - not supported
+        (void)addr;
+        (void)num_cachelines;
+#endif
+    }
+};
 
-    // Calculate number of 128B chunks needed to cover num_scalars elements
-    constexpr index_t chunk_size_bytes   = 128;
-    constexpr index_t elements_per_chunk = chunk_size_bytes / sizeof(T);
-    unsigned int chunks_needed = (num_scalars + elements_per_chunk - 1) / elements_per_chunk;
+template <typename T, index_t NUM_THREADS, index_t NUM_SCALARS, typename PrefetchOp>
+__global__ void kernel_with_prefetch(const T* src,
+                                     T* dst,
+                                     const T CK_CONSTANT_ADDRESS_SPACE* scalar_data,
+                                     bool enable_prefetch)
+{
+    uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x;
 
-    // Prefetch all scalar data at once using chunks parameter
+    // Calculate number of 128B cachelines needed to cover NUM_SCALARS elements
+    constexpr index_t cacheline_size         = 128;
+    constexpr index_t elements_per_cacheline = cacheline_size / sizeof(T);
+    constexpr unsigned int cachelines_needed =
+        (NUM_SCALARS + elements_per_cacheline - 1) / elements_per_cacheline;
+
+    // Prefetch all scalar data at once
     if(threadIdx.x == 0)
     {
-        prefetch_to_constant_cache(scalar_elems, chunks_needed);
+        if(enable_prefetch)
+        {
+            enable_scalar_prefetch();
+        }
+        PrefetchOp{}(scalar_data, cachelines_needed);
    }
 
     T sum = 0;
-    if(tid < num_elements)
+    if(tid < NUM_THREADS)
     {
-        sum = src[tid]; // load from global mem to make sure prefetch finished
+        sum = src[tid]; // load from global mem to give the prefetch time to finish or come
+                        // close to finishing
     }
     __syncthreads(); // waits on loads from global mem
-    if(tid < num_elements)
+    if(tid < NUM_THREADS)
     {
         // Access prefetched scalar data
-        for(index_t i = 0; i < num_scalars; i++)
+        for(uint32_t i = 0; i < NUM_SCALARS; i++)
         {
-            sum += scalar_elems[i]; // should be fast due to scalars being preloaded
+            sum += scalar_data[i]; // should be fast due to scalars being preloaded
         }
 
         dst[tid] = sum;
     }
 }
 
-template <typename T>
-__global__ void
-kernel_with_scalar_buffer_prefetch(const T* src,
-                                   T* dst,
-                                   const void CK_CONSTANT_ADDRESS_SPACE* scalar_data,
-                                   index_t num_elements,
-                                   index_t num_scalars)
+template <typename PrefetchKernel, typename T, index_t NUM_THREADS, index_t NUM_SCALARS>
+bool test_prefetch_impl(bool time_kernels,
+                        const PrefetchKernel& prefetch_kernel,
+                        const std::string& kernel_name)
 {
-    index_t tid = blockIdx.x * blockDim.x + threadIdx.x;
-
-    const T CK_CONSTANT_ADDRESS_SPACE* scalar_elems =
-        static_cast<const T CK_CONSTANT_ADDRESS_SPACE*>(scalar_data);
-
-    // Calculate number of 128B chunks needed to cover num_scalars elements
-    constexpr index_t chunk_size_bytes   = 128;
-    constexpr index_t elements_per_chunk = chunk_size_bytes / sizeof(T);
-    unsigned int chunks_needed = (num_scalars + elements_per_chunk - 1) / elements_per_chunk;
-
-    __amdgpu_buffer_rsrc_t src_wave_buffer_resource =
-        make_wave_buffer_resource_new(scalar_elems, num_scalars);
-
-    // Prefetch all scalar data at once using chunks parameter
-    if(threadIdx.x == 0)
-    {
-        prefetch_to_constant_cache<0>(src_wave_buffer_resource, chunks_needed);
-    }
-
-    T sum = 0;
-    if(tid < num_elements)
-    {
-        sum = src[tid]; // load from global mem to make sure prefetch finished
-    }
-    __syncthreads(); // waits on loads from global mem
-    if(tid < num_elements)
-    {
-        // Access prefetched scalar data
-        for(index_t i = 0; i < num_scalars; i++)
-        {
-            sum += amd_s_buffer_load_impl<T, 1>(
-                src_wave_buffer_resource,
-                i * sizeof(T)); // should be fast due to scalars being preloaded
-        }
-
-        dst[tid] = sum;
-    }
-}
-
-template <typename PrefetchKernel, typename T>
-bool test_constant_prefetch_impl(const PrefetchKernel& prefetch_kernel,
-                                 const std::string& kernel_name)
-{
-    constexpr index_t num_elements = 512;
-    constexpr index_t num_scalars  = 512;
+    // TODO: maybe add more prefetch instructions inside the kernel to support more values
+    assert(NUM_SCALARS * sizeof(T) <= (128 * 32));
+    constexpr index_t num_elements = NUM_THREADS;
+    constexpr index_t num_scalars  = NUM_SCALARS;
 
     constexpr index_t block_size = 256;
     constexpr index_t grid_size  = (num_elements + block_size - 1) / block_size;
@@ -171,14 +161,75 @@ bool test_constant_prefetch_impl(const PrefetchKernel& prefetch_kernel,
     hipStream_t stream;
     hip_check_error(hipStreamCreate(&stream));
 
-    prefetch_kernel<<<grid_size, block_size, 0, stream>>>(
-        static_cast<T*>(d_src.GetDeviceBuffer()),
-        static_cast<T*>(d_dst_with_prefetch_chunks.GetDeviceBuffer()),
-        cast_pointer_to_constant_address_space(d_scalar.GetDeviceBuffer()),
-        num_elements,
-        num_scalars);
+    if(time_kernels)
+    {
+        ck::static_for<0, 2, 1>{}([&](auto static_i) {
+            constexpr bool prefetch_enabled = static_i == 0;
+            std::cout << "PREFETCH " << (prefetch_enabled ? "ENABLED!" : "DISABLED!") << std::endl;
 
-    hip_check_error(hipStreamSynchronize(stream));
+            constexpr int num_warmup     = 1;
+            constexpr int num_iterations = 10;
+
+            // Warmup runs
+            for(int i = 0; i < num_warmup; i++)
+            {
+                prefetch_kernel<<<grid_size, block_size, 0, stream>>>(
+                    static_cast<T*>(d_src.GetDeviceBuffer()),
+                    static_cast<T*>(d_dst_with_prefetch_chunks.GetDeviceBuffer()),
+                    cast_pointer_to_constant_address_space(
+                        static_cast<T*>(d_scalar.GetDeviceBuffer())),
+                    prefetch_enabled);
+            }
+            hip_check_error(hipStreamSynchronize(stream));
+
+            // Performance measurement
+            hipEvent_t start, stop;
+            hip_check_error(hipEventCreate(&start));
+            hip_check_error(hipEventCreate(&stop));
+
+            hip_check_error(hipEventRecord(start, stream));
+            for(int i = 0; i < num_iterations; i++)
+            {
+                prefetch_kernel<<<grid_size, block_size, 0, stream>>>(
+                    static_cast<T*>(d_src.GetDeviceBuffer()),
+                    static_cast<T*>(d_dst_with_prefetch_chunks.GetDeviceBuffer()),
+                    cast_pointer_to_constant_address_space(
+                        static_cast<T*>(d_scalar.GetDeviceBuffer())),
+                    prefetch_enabled);
+            }
+            hip_check_error(hipEventRecord(stop, stream));
+
+            hip_check_error(hipStreamSynchronize(stream));
+
+            float elapsed_ms = 0;
+            hip_check_error(hipEventElapsedTime(&elapsed_ms, start, stop));
+
+            float avg_time_us       = (elapsed_ms * 1000.0f) / num_iterations;
+            float total_bytes       = (num_elements * sizeof(T) + num_scalars * sizeof(T)); // read
+            float bandwidth_gb_s    = (total_bytes / (avg_time_us * 1e-6)) / 1e9;
+            float ops_per_iteration = num_elements * num_scalars; // adds
+            float gflops            = (ops_per_iteration / (avg_time_us * 1e-6)) / 1e9;
+
+            std::cout << "  Performance: " << std::endl;
+            std::cout << "    Average kernel time: " << avg_time_us << " us" << std::endl;
+            std::cout << "    Effective bandwidth: " << bandwidth_gb_s << " GB/s" << std::endl;
+            std::cout << "    Compute throughput: " << gflops << " GFLOPS" << std::endl;
+
+            hip_check_error(hipEventDestroy(start));
+            hip_check_error(hipEventDestroy(stop));
+        });
+    }
+    else
+    {
+        prefetch_kernel<<<grid_size, block_size, 0, stream>>>(
+            static_cast<T*>(d_src.GetDeviceBuffer()),
+            static_cast<T*>(d_dst_with_prefetch_chunks.GetDeviceBuffer()),
+            cast_pointer_to_constant_address_space(
+                static_cast<T*>(d_scalar.GetDeviceBuffer())),
+            true);
+
+        hip_check_error(hipStreamSynchronize(stream));
+    }
 
     // Copy results back
     d_dst_with_prefetch_chunks.FromDevice(h_dst_with_prefetch_chunks.data());
@@ -186,7 +237,8 @@ bool test_constant_prefetch_impl(const PrefetchKernel& prefetch_kernel,
     // Verify results
     bool pass = ck::utils::check_err(h_dst_with_prefetch_chunks, h_expected);
 
-    std::cout << (pass ? "PASS" : "FAIL") << std::endl;
+    std::cout << "  Correctness: " << (pass ? "PASS" : "FAIL") << std::endl;
+    std::cout << std::endl;
 
     hip_check_error(hipStreamDestroy(stream));
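A note on the immediate in enable_scalar_prefetch(): __builtin_amdgcn_s_setreg takes the standard s_setreg SIMM16 encoding, with the hardware-register id in bits [5:0], the field offset in bits [10:6], and the field size minus one in bits [15:11]. So 1 | (24 << 6) selects hwreg 1 (MODE) and a 1-bit field at offset 24, which is SCALAR_PREFETCH_EN per the comment in the diff. A small compile-time check of that encoding (the hwreg helper below is illustrative, not a CK or HIP API):

#include <cstdint>

// s_setreg SIMM16 layout: id in [5:0], offset in [10:6], (size - 1) in [15:11]
constexpr uint32_t hwreg(uint32_t id, uint32_t offset, uint32_t size)
{
    return id | (offset << 6) | ((size - 1) << 11);
}

// Reproduces the literal used by enable_scalar_prefetch() above
static_assert(hwreg(/*id=*/1, /*offset=*/24, /*size=*/1) == (1 | (24 << 6)),
              "SCALAR_PREFETCH_EN immediate mismatch");

int main() { return 0; }

This also suggests why the timing path threads enable_prefetch through to the kernel: the prefetch op itself always executes, but with the MODE bit left at zero it should have no effect, giving a like-for-like baseline for the ENABLED/DISABLED comparison.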