From 8a7ecfa4bd8ddf1946c5560a8a2f46686f76dc69 Mon Sep 17 00:00:00 2001 From: Tianyuan Wu Date: Thu, 7 Aug 2025 03:54:05 +0000 Subject: [PATCH] Revert "Enable CK_TILE_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT for gfx12" This reverts commit ceee9180070dda0f7f351dad850b075cd53ba433. --- /* NOTE(review): Per the hunks below, this patch (1) defines HAS_GLOBAL_ATOMIC_PK_ADD_BUILTIN as the conjunction of two __has_builtin checks (the v2f16 and v2bf16 global atomic fadd builtins); (2) adds add_f16x2_t, which adds the two lanes of an fp16x2_t element-wise via add(); (3) adds an atomic_add specialization for fp16x2_t that calls __builtin_amdgcn_global_atomic_fadd_v2f16 when that macro is true and otherwise reinterprets the destination as a uint32_t word and retries atomicCAS until the value it returns equals the expected old word; (4) adds one more accepted (N == 2 || N == 4 || N == 8) alternative to the type/width condition at the top of atomic_add_g; and (5) adds an atomic_add_g branch that runs static_for<0, N / 2, 1> and calls atomic_add for each index. Items to confirm before applying: (a) in the config.hpp hunk the removed and added #elif lines both still contain defined(__gfx12__) and differ only in the space before the trailing comment, which does not obviously match the "Revert ... for gfx12" subject - verify whether dropping the gfx12 term was intended; (b) the macro expansion is not parenthesized, which is fine for the plain #if that uses it here but would negate only the first term if it were ever used under a logical NOT; (c) several template argument lists look empty in this copy (for example std::is_same::value, c_style_pointer_cast(p_dst) and get_as()), so the text may have been damaged in transit and should be checked against the original commit. */ .../core/arch/generic_memory_space_atomic.hpp | 58 +++++++++++++++++++ include/ck_tile/core/config.hpp | 2 +- 2 files changed, 59 insertions(+), 1 deletion(-) diff --git a/include/ck_tile/core/arch/generic_memory_space_atomic.hpp b/include/ck_tile/core/arch/generic_memory_space_atomic.hpp index 07c6aa0baf..c02c46958c 100644 --- a/include/ck_tile/core/arch/generic_memory_space_atomic.hpp +++ b/include/ck_tile/core/arch/generic_memory_space_atomic.hpp @@ -6,6 +6,10 @@ #include "ck_tile/core/numeric/type_convert.hpp" #include "ck_tile/core/container/thread_buffer.hpp" +#define HAS_GLOBAL_ATOMIC_PK_ADD_BUILTIN \ + __has_builtin(__builtin_amdgcn_global_atomic_fadd_v2f16) && \ + __has_builtin(__builtin_amdgcn_global_atomic_fadd_v2bf16) + namespace ck_tile { template @@ -32,6 +36,14 @@ CK_TILE_HOST_DEVICE bf16x4_t add_bf16x4_t(const bf16x4_t& a, const bf16x4_t& b) return rtn; } +CK_TILE_HOST_DEVICE fp16x2_t add_f16x2_t(const fp16x2_t& a, const fp16x2_t& b) +{ + fp16x2_t rtn; + rtn[0] = add(a[0], b[0]); + rtn[1] = add(a[1], b[1]); + return rtn; +} + CK_TILE_HOST_DEVICE fp8x4_t add_fp8x4_t(const fp8x4_t& a, const fp8x4_t& b) { fp8x4_t rtn; @@ -304,6 +316,44 @@ CK_TILE_DEVICE void atomic_add(bf8x8_t* p_dst, bf8x8_t const& x) } while(cur_v.u64 != old_v); } +// +// Atomic add for fp16x2_t +// +template <> +CK_TILE_DEVICE void atomic_add(fp16x2_t* p_dst, fp16x2_t const& x) +{ +#if HAS_GLOBAL_ATOMIC_PK_ADD_BUILTIN + __builtin_amdgcn_global_atomic_fadd_v2f16(c_style_pointer_cast(p_dst), x); +#else + union U32F162_ADDR + { + uint32_t* u32_a; + fp16x2_t* f162_a; + }; + + union U32F162 + { + uint32_t u32; + fp16x2_t f162; + }; + + U32F162_ADDR dword_addr; + U32F162 cur_v; + U32F162 new_; + uint32_t old_v, new_v; 
+ dword_addr.f162_a = p_dst; + cur_v.u32 = *dword_addr.u32_a; + + do + { + old_v = cur_v.u32; + new_.f162 = add_f16x2_t(cur_v.f162, x); + new_v = new_.u32; + cur_v.u32 = atomicCAS(dword_addr.u32_a, old_v, new_v); + } while(cur_v.u32 != old_v); +#endif +} + template CK_TILE_DEVICE void atomic_add_g(T* p_dst, const thread_buffer& x) { @@ -311,6 +361,7 @@ CK_TILE_DEVICE void atomic_add_g(T* p_dst, const thread_buffer& x) (std::is_same::value && (N == 1)) || (std::is_same::value && (N == 1 || N == 2)) || (std::is_same::value && (N == 1 || N == 2)) || + (std::is_same::value && (N == 2 || N == 4 || N == 8)) || (std::is_same::value && (N == 2 || N == 4 || N == 8)) || (std::is_same::value && (N == 4 || N == 8 || N == 16)) || (std::is_same::value && (N == 4 || N == 8 || N == 16)), @@ -406,6 +457,13 @@ CK_TILE_DEVICE void atomic_add_g(T* p_dst, const thread_buffer& x) atomic_add(c_style_pointer_cast(p_dst) + 1, x.template get_as()[I1]); } } + else if constexpr(std::is_same::value) + { + static_for<0, N / 2, 1>{}([&](auto i) { + atomic_add(c_style_pointer_cast(p_dst) + i, + x.template get_as()[i]); + }); + } } template diff --git a/include/ck_tile/core/config.hpp b/include/ck_tile/core/config.hpp index 5f83c97968..97c60cf062 100644 --- a/include/ck_tile/core/config.hpp +++ b/include/ck_tile/core/config.hpp @@ -152,7 +152,7 @@ // buffer atomic add: floating point #ifndef __HIP_DEVICE_COMPILE__ // for host code #define CK_TILE_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT 1 -#elif defined(__gfx9__) || defined(__gfx12__) // for GPU code +#elif defined(__gfx9__) || defined(__gfx12__)// for GPU code #define CK_TILE_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT 1 #else // for GPU code #define CK_TILE_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT 0