diff --git a/include/ck_tile/core/arch/amd_buffer_addressing.hpp b/include/ck_tile/core/arch/amd_buffer_addressing.hpp index 31f4eddfc4..29cc3fefe5 100644 --- a/include/ck_tile/core/arch/amd_buffer_addressing.hpp +++ b/include/ck_tile/core/arch/amd_buffer_addressing.hpp @@ -2160,52 +2160,6 @@ CK_TILE_DEVICE void amd_buffer_store_raw_impl(const thread_buffer& dst_thr } } -#if defined(__gfx11__) -template -extern CK_TILE_HOST_DEVICE T add(const T& a, const T& b); - -template -CK_TILE_DEVICE void atomic_add_impl(T* p_dst, const thread_buffer& x) -{ - static_assert(std::is_same::value && (N == 2 || N == 4 || N == 8), - "wrong! not implemented"); - - static_for<0, N / 2, 1>{}([&](auto i) { - union U32F162_ADDR - { - uint32_t* u32_a; - fp16x2_t* f162_a; - }; - - union U32F162 - { - uint32_t u32; - fp16x2_t f162; - }; - - U32F162_ADDR dword_addr; - U32F162 cur_v; - U32F162 new_; - uint32_t old_v, new_v; - dword_addr.f162_a = c_style_pointer_cast(p_dst) + i; - cur_v.u32 = *dword_addr.u32_a; - - do - { - old_v = cur_v.u32; - new_.f162 = [](const fp16x2_t& a, const fp16x2_t& b) { - fp16x2_t rtn; - rtn[0] = add(a[0], b[0]); - rtn[1] = add(a[1], b[1]); - return rtn; - }(cur_v.f162, x.template get_as()[i]); - new_v = new_.u32; - cur_v.u32 = atomicCAS(dword_addr.u32_a, old_v, new_v); - } while(cur_v.u32 != old_v); - }); -} -#endif - template CK_TILE_DEVICE void amd_buffer_atomic_add_impl(const thread_buffer& src_thread_data, int32x4_t dst_wave_buffer_resource, @@ -2712,17 +2666,6 @@ CK_TILE_DEVICE void amd_buffer_atomic_add(const thread_buffer& src_thread_ const bool dst_thread_element_valid, const index_t dst_element_space_size) { -#if defined(__gfx11__) - if constexpr(std::is_same::value) - { - // gfx11 does not support buffer atomic add in fp16 format. - // Thus, call the fallback method instead - ignore = dst_thread_element_valid; - ignore = dst_element_space_size; - atomic_add_impl(&p_dst_wave[dst_thread_element_offset], src_thread_data); - return; - } -#endif const int32x4_t dst_wave_buffer_resource = make_wave_buffer_resource(p_dst_wave, dst_element_space_size * sizeof(T)); diff --git a/include/ck_tile/core/config.hpp b/include/ck_tile/core/config.hpp index 29a8dcd0d1..5f83c97968 100644 --- a/include/ck_tile/core/config.hpp +++ b/include/ck_tile/core/config.hpp @@ -152,7 +152,7 @@ // buffer atomic add: floating point #ifndef __HIP_DEVICE_COMPILE__ // for host code #define CK_TILE_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT 1 -#elif defined(__gfx9__) || defined(__gfx11__) || defined(__gfx12__) // for GPU code +#elif defined(__gfx9__) || defined(__gfx12__) // for GPU code #define CK_TILE_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT 1 #else // for GPU code #define CK_TILE_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT 0