Revert "Add atomic add fallback method for gfx11"

This reverts commit 07a79e797d. Signed-off-by: Tianyuan Wu <Tianyuan.Wu@amd.com>
2026-07-01 04:07:56 +00:00 · 2025-08-07 03:53:49 +00:00
parent 2811a4e1de
commit 2a8fedef23
2 changed files with 1 additions and 58 deletions
--- a/include/ck_tile/core/arch/amd_buffer_addressing.hpp
+++ b/include/ck_tile/core/arch/amd_buffer_addressing.hpp
@@ -2160,52 +2160,6 @@ CK_TILE_DEVICE void amd_buffer_store_raw_impl(const thread_buffer<T, N>& dst_thr
    }
 }

-#if defined(__gfx11__)
-template <typename T, typename ComputeType>
-extern CK_TILE_HOST_DEVICE T add(const T& a, const T& b);
-
-template <typename T, index_t N>
-CK_TILE_DEVICE void atomic_add_impl(T* p_dst, const thread_buffer<T, N>& x)
-{
-    static_assert(std::is_same<T, fp16_t>::value && (N == 2 || N == 4 || N == 8),
-                  "wrong! not implemented");
-
-    static_for<0, N / 2, 1>{}([&](auto i) {
-        union U32F162_ADDR
-        {
-            uint32_t* u32_a;
-            fp16x2_t* f162_a;
-        };
-
-        union U32F162
-        {
-            uint32_t u32;
-            fp16x2_t f162;
-        };
-
-        U32F162_ADDR dword_addr;
-        U32F162 cur_v;
-        U32F162 new_;
-        uint32_t old_v, new_v;
-        dword_addr.f162_a = c_style_pointer_cast<fp16x2_t*>(p_dst) + i;
-        cur_v.u32         = *dword_addr.u32_a;
-
-        do
-        {
-            old_v     = cur_v.u32;
-            new_.f162 = [](const fp16x2_t& a, const fp16x2_t& b) {
-                fp16x2_t rtn;
-                rtn[0] = add<fp16_t, float>(a[0], b[0]);
-                rtn[1] = add<fp16_t, float>(a[1], b[1]);
-                return rtn;
-            }(cur_v.f162, x.template get_as<fp16x2_t>()[i]);
-            new_v     = new_.u32;
-            cur_v.u32 = atomicCAS(dword_addr.u32_a, old_v, new_v);
-        } while(cur_v.u32 != old_v);
-    });
-}
-#endif
-
 template <typename T, index_t N>
 CK_TILE_DEVICE void amd_buffer_atomic_add_impl(const thread_buffer<T, N>& src_thread_data,
                                               int32x4_t dst_wave_buffer_resource,
@@ -2712,17 +2666,6 @@ CK_TILE_DEVICE void amd_buffer_atomic_add(const thread_buffer<T, N>& src_thread_
                                          const bool dst_thread_element_valid,
                                          const index_t dst_element_space_size)
 {
-#if defined(__gfx11__)
-    if constexpr(std::is_same<T, fp16_t>::value)
-    {
-        // gfx11 does not support buffer atomic add in fp16 format.
-        // Thus, call the fallback method instead
-        ignore = dst_thread_element_valid;
-        ignore = dst_element_space_size;
-        atomic_add_impl<T, N>(&p_dst_wave[dst_thread_element_offset], src_thread_data);
-        return;
-    }
-#endif
    const int32x4_t dst_wave_buffer_resource =
        make_wave_buffer_resource(p_dst_wave, dst_element_space_size * sizeof(T));

--- a/include/ck_tile/core/config.hpp
+++ b/include/ck_tile/core/config.hpp
@@ -152,7 +152,7 @@
 // buffer atomic add: floating point
 #ifndef __HIP_DEVICE_COMPILE__ // for host code
 #define CK_TILE_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT 1
-#elif defined(__gfx9__) || defined(__gfx11__) || defined(__gfx12__) // for GPU code
+#elif defined(__gfx9__) || defined(__gfx12__) // for GPU code
 #define CK_TILE_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT 1
 #else // for GPU code
 #define CK_TILE_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT 0