Expand the bandwidth of direct_global_to_lds for gfx950 (#2576)

* Expand the bandwidth of direct_global_to_lds for gfx950

* clang-format

* fix the remod.py and script for clang format

---------

Co-authored-by: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
This commit is contained in:
Thomas Ning
2025-07-28 23:56:53 -07:00
committed by GitHub
parent 01642ca8b1
commit 9d4b494f07
3 changed files with 33 additions and 13 deletions

View File

@@ -2762,11 +2762,6 @@ CK_TILE_DEVICE void amd_direct_load_global_to_lds(const T* global_base_ptr,
const bool is_valid,
const index_t src_element_space_size)
{
// Direct loads require that each thread reads and writes exactly a single DWORD.
constexpr auto dword_bytes = 4;
constexpr auto bytes_per_thread = sizeof(T) * NumElemsPerThread;
static_assert(bytes_per_thread == dword_bytes);
const uint32_t* global_ptr =
reinterpret_cast<uint32_t*>(reinterpret_cast<uintptr_t>(global_base_ptr));
const int32x4_t src_resource =
@@ -2783,12 +2778,27 @@ CK_TILE_DEVICE void amd_direct_load_global_to_lds(const T* global_base_ptr,
"s"(src_resource)
: "memory");
#else
// Direct loads require that each thread reads and writes exactly a single DWORD.
#if defined(__gfx9__)
constexpr auto bytes_per_thread = sizeof(T) * NumElemsPerThread;
#endif
// Direct loads require that each thread reads and writes a multiple of DWORDs (4 bytes).
// For gfx950: supports 1, 3, or 4 DWORDs per thread
// For gfx942: supports exactly 1 DWORD per thread
#if defined(__gfx950__)
constexpr auto dword_bytes = 4;
static_assert(bytes_per_thread == dword_bytes || bytes_per_thread == dword_bytes * 3 ||
bytes_per_thread == dword_bytes * 4);
#elif defined(__gfx9__)
constexpr auto dword_bytes = 4;
static_assert(bytes_per_thread == dword_bytes);
#endif
// LDS pointer must be attributed with the LDS address space.
as3_uint32_ptr lds_ptr =
reinterpret_cast<as3_uint32_ptr>(reinterpret_cast<uintptr_t>(lds_base_ptr + lds_offset));
llvm_amdgcn_raw_buffer_load_lds(
src_resource, lds_ptr, sizeof(uint32_t), global_offset_bytes, 0, 0, 0);
src_resource, lds_ptr, bytes_per_thread, global_offset_bytes, 0, 0, 0);
#endif
}

View File

@@ -2532,11 +2532,6 @@ CK_TILE_DEVICE void amd_direct_load_global_to_lds(const T* global_base_ptr,
const bool is_valid,
const index_t src_element_space_size)
{
// Direct loads require that each thread reads and writes exactly a single DWORD.
constexpr auto dword_bytes = 4;
constexpr auto bytes_per_thread = sizeof(T) * NumElemsPerThread;
static_assert(bytes_per_thread == dword_bytes);
const uint32_t* global_ptr =
reinterpret_cast<uint32_t*>(reinterpret_cast<uintptr_t>(global_base_ptr));
const int32x4_t src_resource =
@@ -2553,12 +2548,27 @@ CK_TILE_DEVICE void amd_direct_load_global_to_lds(const T* global_base_ptr,
"s"(src_resource)
: "memory");
#else
// Direct loads require that each thread reads and writes exactly a single DWORD.
#if defined(__gfx9__)
constexpr auto bytes_per_thread = sizeof(T) * NumElemsPerThread;
#endif
// Direct loads require that each thread reads and writes a multiple of DWORDs (4 bytes).
// For gfx950: supports 1, 3, or 4 DWORDs per thread
// For gfx942: supports exactly 1 DWORD per thread
#if defined(__gfx950__)
constexpr auto dword_bytes = 4;
static_assert(bytes_per_thread == dword_bytes || bytes_per_thread == dword_bytes * 3 ||
bytes_per_thread == dword_bytes * 4);
#elif defined(__gfx9__)
constexpr auto dword_bytes = 4;
static_assert(bytes_per_thread == dword_bytes);
#endif
// LDS pointer must be attributed with the LDS address space.
as3_uint32_ptr lds_ptr =
reinterpret_cast<as3_uint32_ptr>(reinterpret_cast<uintptr_t>(lds_base_ptr + lds_offset));
llvm_amdgcn_raw_buffer_load_lds(
src_resource, lds_ptr, sizeof(uint32_t), global_offset_bytes, 0, 0, 0);
src_resource, lds_ptr, bytes_per_thread, global_offset_bytes, 0, 0, 0);
#endif
}

View File

@@ -76,7 +76,7 @@ class submodule_t:
gen_header(Path(k) / (f'{km}.hpp'), kv)
else:
gen_header(Path(f'{k}.hpp'), v)
submodule = submodule_t()
# formatting