mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-05 14:11:29 +00:00
Fix direct lds load for gfx950 and clang20 (#2346)
* fix direct lds load for gfx950 and clang20 * Update include/ck/utility/amd_buffer_addressing_builtins.hpp * Fix format --------- Co-authored-by: Aviral Goel <aviral.goel@amd.com> Co-authored-by: Andriy Roshchenko <andriy.roshchenko@amd.com>
This commit is contained in:
@@ -402,7 +402,7 @@ __device__ void amd_global_atomic_add_impl(const typename vector_type<T, N>::typ
|
||||
tmp.template AsType<half2_t>()[i]);
|
||||
});
|
||||
}
|
||||
#if defined(__gfx942__) || defined(__gfx950__)
|
||||
#if defined(__gfx942__) || defined(__gfx950__) || defined(__gfx12__)
|
||||
else if constexpr(is_same<T, bhalf_t>::value)
|
||||
{
|
||||
vector_type<bhalf_t, N> tmp{src_thread_data};
|
||||
@@ -838,10 +838,18 @@ __device__ void amd_direct_load_global_to_lds(const T* global_base_ptr,
|
||||
const bool is_valid,
|
||||
const index_t src_element_space_size)
|
||||
{
|
||||
// Direct loads require that each thread reads and writes exactly a single DWORD.
|
||||
constexpr auto dword_bytes = 4;
|
||||
// Direct loads require that each thread reads and writes a multiple of DWORDs (4 bytes).
|
||||
// For gfx950: supports 1, 3, or 4 DWORDs per thread
|
||||
// For gfx942: supports exactly 1 DWORD per thread
|
||||
constexpr auto bytes_per_thread = sizeof(T) * NumElemsPerThread;
|
||||
#if defined(__gfx950__)
|
||||
constexpr auto dword_bytes = 4;
|
||||
static_assert(bytes_per_thread == dword_bytes || bytes_per_thread == dword_bytes * 3 ||
|
||||
bytes_per_thread == dword_bytes * 4);
|
||||
#elif defined(__gfx942__)
|
||||
constexpr auto dword_bytes = 4;
|
||||
static_assert(bytes_per_thread == dword_bytes);
|
||||
#endif
|
||||
|
||||
const int32x4_t src_resource =
|
||||
make_wave_buffer_resource(global_base_ptr, src_element_space_size);
|
||||
@@ -872,7 +880,7 @@ __device__ void amd_direct_load_global_to_lds(const T* global_base_ptr,
|
||||
#endif
|
||||
|
||||
llvm_amdgcn_raw_buffer_load_lds(
|
||||
src_resource, lds_ptr, sizeof(uint32_t), global_offset_bytes, 0, 0, 0);
|
||||
src_resource, lds_ptr, bytes_per_thread, global_offset_bytes, 0, 0, 0);
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
|
||||
Reference in New Issue
Block a user