From 723dd9813e2880ecc7848bc81328c73c99fa8c86 Mon Sep 17 00:00:00 2001 From: carlushuang Date: Sat, 22 Jun 2024 00:00:13 +0800 Subject: [PATCH] WA for rocm-6.2+ s constrait for buffer resource (#1346) * WA for rocm-6.2+ s constrait for buffer resource * add missing memory clobber [ROCm/composable_kernel commit: fa129c1a5db62354c4b39857d2b1598bb618f8ce] --- include/ck/utility/amd_buffer_addressing.hpp | 3 ++- include/ck_tile/core/arch/amd_buffer_addressing.hpp | 10 ++++++++-- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/include/ck/utility/amd_buffer_addressing.hpp b/include/ck/utility/amd_buffer_addressing.hpp index cfa4cabee5..ab22134fc6 100644 --- a/include/ck/utility/amd_buffer_addressing.hpp +++ b/include/ck/utility/amd_buffer_addressing.hpp @@ -991,7 +991,8 @@ __device__ void amd_direct_load_global_to_lds(const T* global_base_ptr, asm volatile("s_mov_b32 m0, %0; \n\t" "buffer_load_dword %1, %2, 0 offen lds;\n\t" ::"s"(lds_ptr_sgpr), "v"(global_offset_bytes), - "s"(src_resource)); + "s"(src_resource) + : "memory"); #else // LDS pointer must be attributed with the LDS address space. __attribute__((address_space(3))) uint32_t* lds_ptr = diff --git a/include/ck_tile/core/arch/amd_buffer_addressing.hpp b/include/ck_tile/core/arch/amd_buffer_addressing.hpp index 13e92ef0bb..2cd8bb5f01 100644 --- a/include/ck_tile/core/arch/amd_buffer_addressing.hpp +++ b/include/ck_tile/core/arch/amd_buffer_addressing.hpp @@ -26,7 +26,12 @@ struct __attribute__((packed)) buffer_resource CK_TILE_DEVICE int32x4_t make_wave_buffer_resource(const void* ptr, uint32_t size = 0xffffffff) { buffer_resource res{ptr, size, CK_TILE_BUFFER_RESOURCE_3RD_DWORD}; - return __builtin_bit_cast(int32x4_t, res); + int32x4_t r = __builtin_bit_cast(int32x4_t, res); + r.x = __builtin_amdgcn_readfirstlane(r.x); + r.y = __builtin_amdgcn_readfirstlane(r.y); + r.z = __builtin_amdgcn_readfirstlane(r.z); + r.w = __builtin_amdgcn_readfirstlane(r.w); + return r; } namespace impl { @@ -2104,7 +2109,8 @@ CK_TILE_DEVICE void amd_direct_load_global_to_lds(const T* global_base_ptr, asm volatile("s_mov_b32 m0, %0; \n\t" "buffer_load_dword %1, %2, 0 offen lds;\n\t" ::"s"(lds_ptr_sgpr), "v"(global_offset_bytes), - "s"(src_resource)); + "s"(src_resource) + : "memory"); #else // LDS pointer must be attributed with the LDS address space. __attribute__((address_space(3))) uint32_t* lds_ptr =