From 1810074bd78b415caef0fc8f33040936e9d26008 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Juan=20Manuel=20Martinez=20Caama=C3=B1o?= Date: Mon, 16 Jun 2025 17:32:50 +0200 Subject: [PATCH] [buffer store/load] Replace memcpy with __builtin_memcpy in cast_to_amdgpu_buffer_rsrc_t We used memcpy to implement a bitcast to the opaque type __amdgpu_buffer_rsrc_t. However, HIP's implementation of memcpy did not allow the compiler to infer that the result of copying a uniform value is also uniform. This resulted in a waterfall loop over every value that the copy could take (and a loss in performance). When we use __builtin_memcpy, the optimizer correctly handles the uniform copy. Solves SWDEV-537500 --- include/ck_tile/core/arch/amd_buffer_addressing.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/ck_tile/core/arch/amd_buffer_addressing.hpp b/include/ck_tile/core/arch/amd_buffer_addressing.hpp index 7111eed596..8a6ff99a83 100644 --- a/include/ck_tile/core/arch/amd_buffer_addressing.hpp +++ b/include/ck_tile/core/arch/amd_buffer_addressing.hpp @@ -92,7 +92,7 @@ CK_TILE_DEVICE __amdgpu_buffer_rsrc_t cast_to_amdgpu_buffer_rsrc_t(int32x4_t res { __amdgpu_buffer_rsrc_t as_rsrc; static_assert(sizeof(res) == sizeof(as_rsrc) && "Size of buffer resource should match"); - memcpy(&as_rsrc, &res, sizeof(res)); + __builtin_memcpy(&as_rsrc, &res, sizeof(res)); return as_rsrc; } #endif