// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT #pragma once #include "data_type.hpp" #include "dtype_fp64.hpp" namespace ck { // Caution: DO NOT REMOVE // intentionally have only declaration but no definition to cause compilation failure when trying to // instantiate this template. The purpose is to make the implementation of atomic_add explicit for // each datatype. template __device__ X atomic_add(X* p_dst, const X& x); template <> __device__ int32_t atomic_add(int32_t* p_dst, const int32_t& x) { return atomicAdd(p_dst, x); } template <> __device__ uint32_t atomic_add(uint32_t* p_dst, const uint32_t& x) { return atomicAdd(p_dst, x); } template <> __device__ float atomic_add(float* p_dst, const float& x) { return atomicAdd(p_dst, x); } template <> __device__ unsigned short atomic_add(unsigned short* p_dst, const unsigned short& x) { // Use atomicAdd with unsigned int return static_cast( atomicAdd(reinterpret_cast(p_dst), static_cast(x))); } template <> __device__ _Float16 atomic_add<_Float16>(_Float16* p_dst, const _Float16& x) { // Use atomicAdd with unsigned int return static_cast<_Float16>( atomicAdd(reinterpret_cast(p_dst), static_cast(x))); } template <> __device__ double atomic_add(double* p_dst, const double& x) { return atomicAdd(p_dst, x); } template <> __device__ float2_t atomic_add(float2_t* p_dst, const float2_t& x) { constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; const vector_type vx{x}; vector_type vy{0}; vy.template AsType()(I0) = atomicAdd(c_style_pointer_cast(p_dst), vx.template AsType()[I0]); vy.template AsType()(I1) = atomicAdd(c_style_pointer_cast(p_dst) + 1, vx.template AsType()[I1]); return vy.template AsType()[I0]; } template <> __device__ float4_t atomic_add(float4_t* p_dst, const float4_t& x) { constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; constexpr auto I2 = Number<2>{}; constexpr auto I3 = Number<3>{}; const vector_type vx{x}; vector_type vy{0}; vy.template AsType()(I0) = atomicAdd(c_style_pointer_cast(p_dst), vx.template AsType()[I0]); vy.template AsType()(I1) = atomicAdd(c_style_pointer_cast(p_dst) + 1, vx.template AsType()[I1]); vy.template AsType()(I2) = atomicAdd(c_style_pointer_cast(p_dst) + 2, vx.template AsType()[I2]); vy.template AsType()(I3) = atomicAdd(c_style_pointer_cast(p_dst) + 3, vx.template AsType()[I3]); return vy.template AsType()[I0]; } template <> __device__ double2_t atomic_add(double2_t* p_dst, const double2_t& x) { constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; const vector_type vx{x}; vector_type vy{0}; vy.template AsType()(I0) = atomicAdd(c_style_pointer_cast(p_dst), vx.template AsType()[I0]); vy.template AsType()(I1) = atomicAdd(c_style_pointer_cast(p_dst) + 1, vx.template AsType()[I1]); return vy.template AsType()[I0]; } #if defined(__gfx11__) template <> __device__ float8_t atomic_add(float8_t* p_dst, const float8_t& x) { constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; constexpr auto I2 = Number<2>{}; constexpr auto I3 = Number<3>{}; constexpr auto I4 = Number<4>{}; constexpr auto I5 = Number<5>{}; constexpr auto I6 = Number<6>{}; constexpr auto I7 = Number<7>{}; const vector_type vx{x}; vector_type vy{0}; vy.template AsType()(I0) = atomicAdd(c_style_pointer_cast(p_dst), vx.template AsType()[I0]); vy.template AsType()(I1) = atomicAdd(c_style_pointer_cast(p_dst) + 1, vx.template AsType()[I1]); vy.template AsType()(I2) = atomicAdd(c_style_pointer_cast(p_dst) + 2, vx.template AsType()[I2]); vy.template AsType()(I3) = atomicAdd(c_style_pointer_cast(p_dst) + 3, vx.template AsType()[I3]); vy.template AsType()(I4) = atomicAdd(c_style_pointer_cast(p_dst) + 4, vx.template AsType()[I4]); vy.template AsType()(I5) = atomicAdd(c_style_pointer_cast(p_dst) + 5, vx.template AsType()[I5]); vy.template AsType()(I6) = atomicAdd(c_style_pointer_cast(p_dst) + 6, vx.template AsType()[I6]); vy.template AsType()(I7) = atomicAdd(c_style_pointer_cast(p_dst) + 7, vx.template AsType()[I7]); return vy.template AsType()[I0]; } template <> __device__ half4_t atomic_add(half4_t* p_dst, const half4_t& x) { constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; constexpr auto I2 = Number<2>{}; constexpr auto I3 = Number<3>{}; const vector_type vx{x}; vector_type vy{0}; vy.template AsType()(I0) = atomic_add(c_style_pointer_cast(p_dst), vx.template AsType()[I0]); vy.template AsType()(I1) = atomic_add(c_style_pointer_cast(p_dst) + 1, vx.template AsType()[I1]); vy.template AsType()(I2) = atomic_add(c_style_pointer_cast(p_dst) + 2, vx.template AsType()[I2]); vy.template AsType()(I3) = atomic_add(c_style_pointer_cast(p_dst) + 3, vx.template AsType()[I3]); return vy.template AsType()[I0]; } template <> __device__ half8_t atomic_add(half8_t* p_dst, const half8_t& x) { constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; constexpr auto I2 = Number<2>{}; constexpr auto I3 = Number<3>{}; constexpr auto I4 = Number<4>{}; constexpr auto I5 = Number<5>{}; constexpr auto I6 = Number<6>{}; constexpr auto I7 = Number<7>{}; const vector_type vx{x}; vector_type vy{0}; vy.template AsType()(I0) = atomic_add(c_style_pointer_cast(p_dst), vx.template AsType()[I0]); vy.template AsType()(I1) = atomic_add(c_style_pointer_cast(p_dst) + 1, vx.template AsType()[I1]); vy.template AsType()(I2) = atomic_add(c_style_pointer_cast(p_dst) + 2, vx.template AsType()[I2]); vy.template AsType()(I3) = atomic_add(c_style_pointer_cast(p_dst) + 3, vx.template AsType()[I3]); vy.template AsType()(I4) = atomic_add(c_style_pointer_cast(p_dst) + 4, vx.template AsType()[I4]); vy.template AsType()(I5) = atomic_add(c_style_pointer_cast(p_dst) + 5, vx.template AsType()[I5]); vy.template AsType()(I6) = atomic_add(c_style_pointer_cast(p_dst) + 6, vx.template AsType()[I6]); vy.template AsType()(I7) = atomic_add(c_style_pointer_cast(p_dst) + 7, vx.template AsType()[I7]); return vy.template AsType()[I0]; } #endif // defined(__gfx11__) // Caution: DO NOT REMOVE // intentionally have only declaration but no definition to cause compilation failure when trying to // instantiate this template. The purpose is to make the implementation of atomic_max explicit for // each datatype. template __device__ X atomic_max(X* p_dst, const X& x); template <> __device__ int32_t atomic_max(int32_t* p_dst, const int32_t& x) { return atomicMax(p_dst, x); } template <> __device__ uint32_t atomic_max(uint32_t* p_dst, const uint32_t& x) { return atomicMax(p_dst, x); } template <> __device__ float atomic_max(float* p_dst, const float& x) { return atomicMax(p_dst, x); } template <> __device__ double atomic_max(double* p_dst, const double& x) { return atomicMax(p_dst, x); } template <> __device__ float2_t atomic_max(float2_t* p_dst, const float2_t& x) { constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; const vector_type vx{x}; vector_type vy{0}; vy.template AsType()(I0) = atomicMax(c_style_pointer_cast(p_dst), vx.template AsType()[I0]); vy.template AsType()(I1) = atomicMax(c_style_pointer_cast(p_dst) + 1, vx.template AsType()[I1]); return vy.template AsType()[I0]; } } // namespace ck