Disable fp32 atomic adds on gfx11 (#3510)

* Disable fp32 atomic adds on gfx11

* Fixes is supported
This commit is contained in:
Bartłomiej Kocot
2026-01-08 00:32:04 +01:00
committed by GitHub
parent aad4cf0985
commit f449a5faaa
4 changed files with 55 additions and 43 deletions

View File

@@ -63,11 +63,7 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, MinimumOccupancy)
{
#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx11__) || defined(__gfx12__))
#if defined(__gfx11__)
// gfx11 does not support *_atomic_pk_add_f16/bf16 instructions
using e_data_type = remove_cvref_t<remove_pointer_t<decltype(karg.p_e_grid)>>;
if constexpr(!(EGlobalMemoryDataOperation == InMemoryDataOperationEnum::AtomicAdd &&
(std::is_same_v<e_data_type, ck::half_t> ||
std::is_same_v<e_data_type, ck::bhalf_t>)))
if constexpr(EGlobalMemoryDataOperation != InMemoryDataOperationEnum::AtomicAdd)
{
#endif
__shared__ char p_shared[GridwiseGemm::template GetSharedMemoryNumberOfByte<

View File

@@ -62,10 +62,7 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, MinimumOccupancy)
#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx11__) || defined(__gfx12__))
#if defined(__gfx11__)
// gfx11 does not support *_atomic_pk_add_f16/bf16 instructions
using c_data_type = remove_cvref_t<remove_pointer_t<decltype(karg.p_e_grid)>>;
if constexpr(!(CGlobalMemoryDataOperation == InMemoryDataOperationEnum::AtomicAdd &&
(std::is_same_v<c_data_type, ck::half_t> ||
std::is_same_v<c_data_type, ck::bhalf_t>)))
if constexpr(CGlobalMemoryDataOperation != InMemoryDataOperationEnum::AtomicAdd)
{
#endif
@@ -1028,6 +1025,17 @@ struct DeviceGroupedConvBwdWeightMultipleD_Wmma_CShuffleV3
{
return false;
}
if(arg.k_batch_ > 1 && ck::is_gfx11_supported())
{
if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
{
std::cout << "Unsupported splitK on gfx11." << std::endl;
}
// gfx11 does not support *_atomic_pk_add_f16/bf16 instructions
return false;
}
if constexpr(std::is_same_v<ComputeTypeA, f8_t> || std::is_same_v<ComputeTypeA, bf8_t> ||
std::is_same_v<ComputeTypeB, f8_t> || std::is_same_v<ComputeTypeB, bf8_t>)
{

View File

@@ -63,28 +63,34 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, MinimumOccupancy)
const index_t num_k_per_block)
{
#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx11__) || defined(__gfx12__))
#if defined(__gfx11__)
if constexpr(CGlobalMemoryDataOperation != InMemoryDataOperationEnum::AtomicAdd)
{
#endif
constexpr index_t LDS_size = GridwiseGemm::template GetSharedMemoryNumberOfByte<
typename GridwiseGemm::EpilogueCShuffle>();
__shared__ char p_shared[LDS_size];
constexpr index_t LDS_size = GridwiseGemm::template GetSharedMemoryNumberOfByte<
typename GridwiseGemm::EpilogueCShuffle>();
__shared__ char p_shared[LDS_size];
auto epilogue_args = typename GridwiseGemm::EpilogueCShuffle{};
auto epilogue_args = typename GridwiseGemm::EpilogueCShuffle{};
GridwiseGemm::template Run<AGridDesc_AK0_M_K1,
BGridDesc_BK0_N_K1,
CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock,
ComputePtrOffsetOfBatch,
NumGroupsToMerge,
HasMainKBlockLoop,
CGlobalMemoryDataOperation,
TailNum>(p_shared,
a_grid_desc_ak0_m_ak1,
b_grid_desc_bk0_n_bk1,
c_grid_desc_mblock_mperblock_nblock_nperblock,
compute_ptr_offset_of_batch,
num_k_per_block,
karg,
epilogue_args);
GridwiseGemm::template Run<AGridDesc_AK0_M_K1,
BGridDesc_BK0_N_K1,
CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock,
ComputePtrOffsetOfBatch,
NumGroupsToMerge,
HasMainKBlockLoop,
CGlobalMemoryDataOperation,
TailNum>(p_shared,
a_grid_desc_ak0_m_ak1,
b_grid_desc_bk0_n_bk1,
c_grid_desc_mblock_mperblock_nblock_nperblock,
compute_ptr_offset_of_batch,
num_k_per_block,
karg,
epilogue_args);
#if defined(__gfx11__)
}
#endif
#else
ignore = karg;
ignore = a_grid_desc_ak0_m_ak1;
@@ -1179,6 +1185,16 @@ struct DeviceGroupedConvBwdWeightTwoStage_Wmma_CShuffleV3
return false;
}
if(arg.k_batch_ > 1 && ck::is_gfx11_supported())
{
if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
{
std::cout << "Unsupported splitK on gfx11." << std::endl;
}
// gfx11 does not support *_atomic_pk_add_f16/bf16 instructions
return false;
}
// Check this here, it allows to use other instances from factory even
// if workspace is not allocated
if(!arg.p_workspace_)

View File

@@ -64,11 +64,7 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, MinimumOccupancy)
{
#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx11__) || defined(__gfx12__))
#if defined(__gfx11__)
// gfx11 does not support *_atomic_pk_add_f16/bf16 instructions
using e_data_type = remove_cvref_t<remove_pointer_t<decltype(karg.p_e_grid)>>;
if constexpr(!(CGlobalMemoryDataOperation == InMemoryDataOperationEnum::AtomicAdd &&
(std::is_same_v<e_data_type, ck::half_t> ||
std::is_same_v<e_data_type, ck::bhalf_t>)))
if constexpr(CGlobalMemoryDataOperation != InMemoryDataOperationEnum::AtomicAdd)
{
#endif
constexpr index_t LDS_size = GridwiseGemm::template GetSharedMemoryNumberOfByte<
@@ -1089,18 +1085,14 @@ struct DeviceGroupedConvBwdWeight_Wmma_CShuffleV3
return false;
}
if constexpr(std::is_same_v<CDataType, ck::half_t> ||
std::is_same_v<CDataType, ck::bhalf_t>)
if(gemm_arg.KBatch > 1 && ck::is_gfx11_supported())
{
if(gemm_arg.KBatch > 1 && ck::is_gfx11_supported())
if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
{
if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
{
std::cout << "Unsupported splitK on gfx11." << std::endl;
}
// gfx11 does not support *_atomic_pk_add_f16/bf16 instructions
return false;
std::cout << "Unsupported splitK on gfx11." << std::endl;
}
// gfx11 does not support *_atomic_pk_add_f16/bf16 instructions
return false;
}
if constexpr(std::is_same_v<ComputeTypeA, f8_t> || std::is_same_v<ComputeTypeA, bf8_t> ||