Fix direct lds load for gfx950 and clang20 (#2346)

* fix direct lds load for gfx950 and clang20

* Update include/ck/utility/amd_buffer_addressing_builtins.hpp

* Fix format

---------

Co-authored-by: Aviral Goel <aviral.goel@amd.com>
Co-authored-by: Andriy Roshchenko <andriy.roshchenko@amd.com>
This commit is contained in:
Illia Silin
2025-06-15 15:22:34 -07:00
committed by GitHub
parent 56f654a826
commit 2d8a804152

View File

@@ -402,7 +402,7 @@ __device__ void amd_global_atomic_add_impl(const typename vector_type<T, N>::typ
tmp.template AsType<half2_t>()[i]);
});
}
#if defined(__gfx942__) || defined(__gfx950__)
#if defined(__gfx942__) || defined(__gfx950__) || defined(__gfx12__)
else if constexpr(is_same<T, bhalf_t>::value)
{
vector_type<bhalf_t, N> tmp{src_thread_data};
@@ -838,10 +838,18 @@ __device__ void amd_direct_load_global_to_lds(const T* global_base_ptr,
const bool is_valid,
const index_t src_element_space_size)
{
// Direct loads require that each thread reads and writes exactly a single DWORD.
constexpr auto dword_bytes = 4;
// Direct loads require that each thread reads and writes a multiple of DWORDs (4 bytes).
// For gfx950: supports 1, 3, or 4 DWORDs per thread
// For gfx942: supports exactly 1 DWORD per thread
constexpr auto bytes_per_thread = sizeof(T) * NumElemsPerThread;
#if defined(__gfx950__)
constexpr auto dword_bytes = 4;
static_assert(bytes_per_thread == dword_bytes || bytes_per_thread == dword_bytes * 3 ||
bytes_per_thread == dword_bytes * 4);
#elif defined(__gfx942__)
constexpr auto dword_bytes = 4;
static_assert(bytes_per_thread == dword_bytes);
#endif
const int32x4_t src_resource =
make_wave_buffer_resource(global_base_ptr, src_element_space_size);
@@ -872,7 +880,7 @@ __device__ void amd_direct_load_global_to_lds(const T* global_base_ptr,
#endif
llvm_amdgcn_raw_buffer_load_lds(
src_resource, lds_ptr, sizeof(uint32_t), global_offset_bytes, 0, 0, 0);
src_resource, lds_ptr, bytes_per_thread, global_offset_bytes, 0, 0, 0);
#endif
}
#endif