Add declarations for atomic add for fp16 and unsigned short. (#2483)

* add template for fp16 atomic add

* add template for unsigned short atomic add

* use atomicCAS in atomic add for fp16 and unsigned short

* revrt back to atomic add using casting

[ROCm/composable_kernel commit: 1b66f3f4a3]
This commit is contained in:
Illia Silin
2025-07-10 07:18:56 -07:00
committed by GitHub
parent 84a7b00497
commit e61ceee502
2 changed files with 22 additions and 22 deletions

View File

@@ -32,6 +32,22 @@ __device__ float atomic_add<float>(float* p_dst, const float& x)
return atomicAdd(p_dst, x);
}
template <>
__device__ unsigned short atomic_add<unsigned short>(unsigned short* p_dst, const unsigned short& x)
{
// Use atomicAdd with unsigned int
return static_cast<unsigned short>(
atomicAdd(reinterpret_cast<unsigned int*>(p_dst), static_cast<unsigned int>(x)));
}
template <>
__device__ _Float16 atomic_add<_Float16>(_Float16* p_dst, const _Float16& x)
{
// Use atomicAdd with unsigned int
return static_cast<_Float16>(
atomicAdd(reinterpret_cast<unsigned int*>(p_dst), static_cast<unsigned int>(x)));
}
template <>
__device__ double atomic_add<double>(double* p_dst, const double& x)
{

View File

@@ -10,27 +10,11 @@ namespace instance {
#if(defined(CK_ENABLE_BF16) && defined(CK_ENABLE_FP8))
using GemmF8F8BF16InstanceVector =
std::vector<std::unique_ptr<DeviceGemmV2BPreshuffle<Row,
Col,
Row,
F8,
F8,
BF16,
PassThrough,
PassThrough,
PassThrough>>>&;
using GemmF8F8BF16InstanceVector = std::vector<std::unique_ptr<
DeviceGemmV2BPreshuffle<Row, Col, Row, F8, F8, BF16, PassThrough, PassThrough, PassThrough>>>&;
using GemmF8F8F16InstanceVector =
std::vector<std::unique_ptr<DeviceGemmV2BPreshuffle<Row,
Col,
Row,
F8,
F8,
F16,
PassThrough,
PassThrough,
PassThrough>>>&;
using GemmF8F8F16InstanceVector = std::vector<std::unique_ptr<
DeviceGemmV2BPreshuffle<Row, Col, Row, F8, F8, F16, PassThrough, PassThrough, PassThrough>>>&;
void add_device_gemm_xdl_universal_preshuffle_f8_f8_bf16_mk_mfma32x32_mn_instances(
GemmF8F8BF16InstanceVector& instances);
@@ -48,7 +32,7 @@ void add_device_gemm_xdl_universal_preshuffle_f8_f8_bf16_mk_mfma_mn_p3_instances
GemmF8F8BF16InstanceVector& instances);
void add_device_gemm_xdl_universal_preshuffle_f8_f8_bf16_mk_mfma_mn_p4_instances(
GemmF8F8BF16InstanceVector& instances);
GemmF8F8BF16InstanceVector& instances);
void add_device_gemm_xdl_universal_preshuffle_f8_f8_bf16_mk_mfma_mn_p5_instances(
GemmF8F8BF16InstanceVector& instances);
@@ -84,7 +68,7 @@ void add_device_gemm_universal_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_compute_defau
GemmF8F8F16InstanceVector& instances);
void add_device_gemm_universal_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p1_default_instances(
GemmF8F8F16InstanceVector& instances);
GemmF8F8F16InstanceVector& instances);
void add_device_gemm_universal_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p2_default_instances(
GemmF8F8F16InstanceVector& instances);
void add_device_gemm_universal_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p3_default_instances(