Add v3 support for Groupd fwd conv+bias+clamp & ckProfiler (#2463)

* Add logging to IsSupported.

* Less casting in AddClamp

* Conv+bias+clamp instances & profiler BF16

* Fix 3D instances & run just 1x for verification.

* :Run just once for verification conv fwd.

* ckProfiler conv fwd clampwq

* Remove exec bit & formatting

* Add support for MultiD for grouped conv fwd v3.

* Enable 2Lds.

* clean

* align instances

* align instances

* profiler fixes

* Fixes

* fix

* fix

---------

Co-authored-by: Adam Osewski <root@quanta-ccs-aus-f01-19.cs-aus.dcgpu>
Co-authored-by: Bartłomiej Kocot <barkocot@amd.com>
This commit is contained in:
Adam Osewski
2025-07-25 10:34:31 +02:00
committed by GitHub
parent b01a27ff22
commit c8eb2f995c
6 changed files with 1098 additions and 301 deletions

View File

@@ -379,10 +379,10 @@ struct AddClamp
__host__ __device__ constexpr void
operator()<half_t, half_t, half_t>(half_t& y, const half_t& x0, const half_t& x1) const
{
const half_t a = x0 + x1;
y = a > type_convert<half_t>(floor_)
? (a < type_convert<half_t>(ceil_) ? a : type_convert<half_t>(ceil_))
: type_convert<half_t>(floor_);
const half_t floor = type_convert<half_t>(floor_);
const half_t ceil = type_convert<half_t>(ceil_);
const half_t a = x0 + x1;
y = a > floor ? (a < ceil ? a : ceil) : floor;
};
template <>

View File

@@ -143,7 +143,8 @@ template <typename ALayout,
typename ComputeTypeA = CDataType,
typename ComputeTypeB = ComputeTypeA,
typename LDSTypeA = ADataType,
typename LDSTypeB = BDataType>
typename LDSTypeB = BDataType,
bool DoElementwiseBeforeCShuffle = false>
struct GridwiseGemmMultiD_xdl_cshuffle_v3
{
static constexpr auto I0 = Number<0>{};
@@ -466,6 +467,12 @@ struct GridwiseGemmMultiD_xdl_cshuffle_v3
{
return make_naive_tensor_descriptor(make_tuple(M, N), make_tuple(I1, StrideC));
}
else
{
static_assert(false,
"The layout configuration is not supported! "
"Only support Row & Col major.");
}
}();
// pad M and N
@@ -538,8 +545,6 @@ struct GridwiseGemmMultiD_xdl_cshuffle_v3
Number<NumDTensor>{});
}
using DsGridDesc_M_N = remove_cvref_t<decltype(MakeDsGridDescriptor_M_N(0, 0, 0, 0, {}))>;
struct Problem
{
__host__ __device__ Problem() = default;
@@ -1245,11 +1250,11 @@ struct GridwiseGemmMultiD_xdl_cshuffle_v3
template <bool HasMainKBlockLoop,
InMemoryDataOperationEnum CGlobalMemoryDataOperation,
TailNumber TailNum = TailNumber::Odd>
__device__ static void Run(const ADataType* p_a_grid,
const BDataType* p_b_grid,
__device__ static void Run(const ADataType* __restrict__ p_a_grid,
const BDataType* __restrict__ p_b_grid,
DsGridPointer& p_ds_grid,
CDataType* p_c_grid,
void* p_shared,
CDataType* __restrict__ p_c_grid,
void* __restrict__ p_shared,
const Problem& problem,
AElementwiseOperation a_element_op,
BElementwiseOperation b_element_op,
@@ -1273,11 +1278,11 @@ struct GridwiseGemmMultiD_xdl_cshuffle_v3
bool HasMainKBlockLoop,
InMemoryDataOperationEnum CGlobalMemoryDataOperation,
TailNumber TailNum = TailNumber::Odd>
__device__ static void Run(const ADataType* p_a_grid,
const BDataType* p_b_grid,
__device__ static void Run(const ADataType* __restrict__ p_a_grid,
const BDataType* __restrict__ p_b_grid,
DsGridPointer& p_ds_grid,
CDataType* p_c_grid,
void* p_shared,
CDataType* __restrict__ p_c_grid,
void* __restrict__ p_shared,
const Problem& problem,
AElementwiseOperation a_element_op,
BElementwiseOperation b_element_op,
@@ -1288,17 +1293,62 @@ struct GridwiseGemmMultiD_xdl_cshuffle_v3
problem.M, problem.MPadded, problem.K, problem.KPadded, problem.StrideA, problem.AK0);
const auto b_grid_desc_bk0_n_bk1 = MakeBGridDescriptor_BK0_N_BK1(
problem.K, problem.KPadded, problem.N, problem.NPadded, problem.StrideB, problem.BK0);
const auto c_grid_desc_m_n = MakeCGridDescriptor_M_N<CLayout>(
problem.M, problem.MPadded, problem.N, problem.NPadded, problem.StrideC);
const auto ds_grid_desc_m_n = MakeDsGridDescriptor_M_N(
problem.M, problem.MPadded, problem.N, problem.NPadded, problem.StrideDs);
const auto c_grid_desc_mblock_mperblock_nblock_nperblock =
MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
c_grid_desc_m_n, problem.MBlock, problem.NBlock);
Run<HasMainKBlockLoop, CGlobalMemoryDataOperation, TailNum, Block2CTileMap>(
p_a_grid,
p_b_grid,
p_ds_grid,
p_c_grid,
p_shared,
problem,
a_element_op,
b_element_op,
c_element_op,
block_2_ctile_map,
a_grid_desc_ak0_m_ak1,
b_grid_desc_bk0_n_bk1,
ds_grid_desc_m_n,
c_grid_desc_m_n);
}
template <bool HasMainKBlockLoop,
InMemoryDataOperationEnum CGlobalMemoryDataOperation,
TailNumber TailNum,
typename Block2CTileMap,
typename AGridDesc_AK0_M_K1,
typename BGridDesc_BK0_N_K1,
typename DsGridDesc_M_N,
typename CGridDesc_M_N>
__device__ static void Run(const ADataType* __restrict__ p_a_grid,
const BDataType* __restrict__ p_b_grid,
DsGridPointer& p_ds_grid,
CDataType* __restrict__ p_c_grid,
void* __restrict__ p_shared,
const Problem& problem,
AElementwiseOperation a_element_op,
BElementwiseOperation b_element_op,
CElementwiseOperation c_element_op,
const Block2CTileMap& block_2_ctile_map,
const AGridDesc_AK0_M_K1& a_grid_desc_ak0_m_ak1,
const BGridDesc_BK0_N_K1& b_grid_desc_bk0_n_bk1,
const DsGridDesc_M_N& ds_grid_desc_m_n,
const CGridDesc_M_N& c_grid_desc_m_n)
{
const auto a_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
p_a_grid, a_grid_desc_ak0_m_ak1.GetElementSpaceSize());
const auto b_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
p_b_grid, b_grid_desc_bk0_n_bk1.GetElementSpaceSize());
const auto c_grid_desc_mblock_mperblock_nblock_nperblock =
MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
c_grid_desc_m_n, problem.MBlock, problem.NBlock);
auto c_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
p_c_grid, c_grid_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize());
@@ -1515,43 +1565,63 @@ struct GridwiseGemmMultiD_xdl_cshuffle_v3
n_thread_data_on_block_to_n0_n1_n2_adaptor.CalculateBottomIndex(
make_multi_index(n_thread_data_on_block));
tensor_operation::element_wise::PassThrough pass_through{};
const auto& vpgr_to_lds_element_op = [&] {
if constexpr(DoElementwiseBeforeCShuffle)
{
return c_element_op;
}
else
{
return pass_through;
}
};
const auto& lds_to_global_element_op = [&] {
if constexpr(!DoElementwiseBeforeCShuffle)
{
return c_element_op;
}
else
{
return pass_through;
}
};
// shuffle: threadwise copy C from VGPR to LDS
auto c_thread_copy_vgpr_to_lds =
ThreadwiseTensorSliceTransfer_v1r3<AccDataType,
CShuffleDataType,
decltype(c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2),
decltype(c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2),
ck::tensor_operation::element_wise::PassThrough,
Sequence<CShuffleMXdlPerWavePerShuffle,
CShuffleNXdlPerWavePerShuffle,
I1,
I1,
M2,
I1,
M4,
I1>,
Sequence<0, 1, 2, 3, 4, 5, 6, 7>,
7,
1,
InMemoryDataOperationEnum::Set,
1,
true>{
c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2,
make_multi_index(0,
0,
m_thread_data_on_block_idx[I1],
n_thread_data_on_block_idx[I1],
m_thread_data_on_block_idx[I2],
m_thread_data_on_block_idx[I3],
m_thread_data_on_block_idx[I4],
n_thread_data_on_block_idx[I2]),
ck::tensor_operation::element_wise::PassThrough{}};
auto c_thread_copy_vgpr_to_lds = ThreadwiseTensorSliceTransfer_v1r3<
AccDataType,
CShuffleDataType,
decltype(c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2),
decltype(c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2),
conditional_t<DoElementwiseBeforeCShuffle,
CElementwiseOperation,
tensor_operation::element_wise::PassThrough>,
Sequence<CShuffleMXdlPerWavePerShuffle,
CShuffleNXdlPerWavePerShuffle,
I1,
I1,
M2,
I1,
M4,
I1>,
Sequence<0, 1, 2, 3, 4, 5, 6, 7>,
7,
1,
InMemoryDataOperationEnum::Set,
1,
true>{c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2,
make_multi_index(0,
0,
m_thread_data_on_block_idx[I1],
n_thread_data_on_block_idx[I1],
m_thread_data_on_block_idx[I2],
m_thread_data_on_block_idx[I3],
m_thread_data_on_block_idx[I4],
n_thread_data_on_block_idx[I2]),
vpgr_to_lds_element_op()};
using EDataType = CDataType;
const auto ds_grid_desc_m_n = MakeDsGridDescriptor_M_N(
problem.M, problem.MPadded, problem.N, problem.NPadded, problem.StrideDs);
const auto ds_grid_desc_mblock_mperblock_nblock_nperblock =
MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
ds_grid_desc_m_n, problem.MBlock, problem.NBlock);
@@ -1601,7 +1671,9 @@ struct GridwiseGemmMultiD_xdl_cshuffle_v3
Tuple<EDataType>,
decltype(c_ds_desc_refs),
decltype(tie(e_grid_desc_mblock_mperblock_nblock_nperblock)),
CElementwiseOperation,
conditional_t<!DoElementwiseBeforeCShuffle,
CElementwiseOperation,
tensor_operation::element_wise::PassThrough>,
Sequence<static_cast<index_t>(EGlobalMemoryDataOperation)>, // FIXME: make Sequence
// support arbitray type
Sequence<1,
@@ -1625,7 +1697,7 @@ struct GridwiseGemmMultiD_xdl_cshuffle_v3
idx_c_ds_block_begin,
tie(e_grid_desc_mblock_mperblock_nblock_nperblock),
make_tuple(make_multi_index(block_m_id, 0, block_n_id, 0)),
c_element_op};
lds_to_global_element_op()};
// space filling curve for threadwise C in VGPR
constexpr auto sfc_c_vgpr =
@@ -1698,12 +1770,12 @@ struct GridwiseGemmMultiD_xdl_cshuffle_v3
template <bool HasMainKBlockLoop,
InMemoryDataOperationEnum CGlobalMemoryDataOperation,
TailNumber TailNum = TailNumber::Odd>
__device__ static void Run_2Lds(const ADataType* p_a_grid,
const BDataType* p_b_grid,
__device__ static void Run_2Lds(const ADataType* __restrict__ p_a_grid,
const BDataType* __restrict__ p_b_grid,
DsGridPointer& p_ds_grid,
CDataType* p_c_grid,
void* p_shared_0,
void* p_shared_1,
CDataType* __restrict__ p_c_grid,
void* __restrict__ p_shared_0,
void* __restrict__ p_shared_1,
const Problem& problem,
AElementwiseOperation a_element_op,
BElementwiseOperation b_element_op,
@@ -1729,12 +1801,12 @@ struct GridwiseGemmMultiD_xdl_cshuffle_v3
bool HasMainKBlockLoop,
InMemoryDataOperationEnum CGlobalMemoryDataOperation,
TailNumber TailNum = TailNumber::Odd>
__device__ static void Run_2Lds(const ADataType* p_a_grid,
const BDataType* p_b_grid,
__device__ static void Run_2Lds(const ADataType* __restrict__ p_a_grid,
const BDataType* __restrict__ p_b_grid,
DsGridPointer& p_ds_grid,
CDataType* p_c_grid,
void* p_shared_0,
void* p_shared_1,
CDataType* __restrict__ p_c_grid,
void* __restrict__ p_shared_0,
void* __restrict__ p_shared_1,
const Problem& problem,
AElementwiseOperation a_element_op,
BElementwiseOperation b_element_op,
@@ -1745,8 +1817,53 @@ struct GridwiseGemmMultiD_xdl_cshuffle_v3
problem.M, problem.MPadded, problem.K, problem.KPadded, problem.StrideA, problem.AK0);
const auto b_grid_desc_bk0_n_bk1 = MakeBGridDescriptor_BK0_N_BK1(
problem.K, problem.KPadded, problem.N, problem.NPadded, problem.StrideB, problem.BK0);
const auto c_grid_desc_m_n = MakeCGridDescriptor_M_N<CLayout>(
problem.M, problem.MPadded, problem.N, problem.NPadded, problem.StrideC);
const auto ds_grid_desc_m_n = MakeDsGridDescriptor_M_N(
problem.M, problem.MPadded, problem.N, problem.NPadded, problem.StrideDs);
Run_2Lds<HasMainKBlockLoop, CGlobalMemoryDataOperation, TailNum>(p_a_grid,
p_b_grid,
p_ds_grid,
p_c_grid,
p_shared_0,
p_shared_1,
problem,
a_element_op,
b_element_op,
c_element_op,
block_2_ctile_map,
a_grid_desc_ak0_m_ak1,
b_grid_desc_bk0_n_bk1,
ds_grid_desc_m_n,
c_grid_desc_m_n);
}
template <bool HasMainKBlockLoop,
InMemoryDataOperationEnum CGlobalMemoryDataOperation,
TailNumber TailNum,
typename Block2CTileMap,
typename AGridDesc_AK0_M_K1,
typename BGridDesc_BK0_N_K1,
typename DsGridDesc_M_N,
typename CGridDesc_M_N>
__device__ static void Run_2Lds(const ADataType* __restrict__ p_a_grid,
const BDataType* __restrict__ p_b_grid,
DsGridPointer& p_ds_grid,
CDataType* __restrict__ p_c_grid,
void* __restrict__ p_shared_0,
void* __restrict__ p_shared_1,
const Problem& problem,
AElementwiseOperation a_element_op,
BElementwiseOperation b_element_op,
CElementwiseOperation c_element_op,
const Block2CTileMap& block_2_ctile_map,
const AGridDesc_AK0_M_K1& a_grid_desc_ak0_m_ak1,
const BGridDesc_BK0_N_K1& b_grid_desc_bk0_n_bk1,
const DsGridDesc_M_N& ds_grid_desc_m_n,
const CGridDesc_M_N& c_grid_desc_m_n)
{
const auto c_grid_desc_mblock_mperblock_nblock_nperblock =
MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
@@ -1982,43 +2099,63 @@ struct GridwiseGemmMultiD_xdl_cshuffle_v3
n_thread_data_on_block_to_n0_n1_n2_adaptor.CalculateBottomIndex(
make_multi_index(n_thread_data_on_block));
tensor_operation::element_wise::PassThrough pass_through{};
const auto& vpgr_to_lds_element_op = [&] {
if constexpr(DoElementwiseBeforeCShuffle)
{
return c_element_op;
}
else
{
return pass_through;
}
};
const auto& lds_to_global_element_op = [&] {
if constexpr(!DoElementwiseBeforeCShuffle)
{
return c_element_op;
}
else
{
return pass_through;
}
};
// shuffle: threadwise copy C from VGPR to LDS
auto c_thread_copy_vgpr_to_lds =
ThreadwiseTensorSliceTransfer_v1r3<AccDataType,
CShuffleDataType,
decltype(c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2),
decltype(c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2),
ck::tensor_operation::element_wise::PassThrough,
Sequence<CShuffleMXdlPerWavePerShuffle,
CShuffleNXdlPerWavePerShuffle,
I1,
I1,
M2,
I1,
M4,
I1>,
Sequence<0, 1, 2, 3, 4, 5, 6, 7>,
7,
1,
InMemoryDataOperationEnum::Set,
1,
true>{
c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2,
make_multi_index(0,
0,
m_thread_data_on_block_idx[I1],
n_thread_data_on_block_idx[I1],
m_thread_data_on_block_idx[I2],
m_thread_data_on_block_idx[I3],
m_thread_data_on_block_idx[I4],
n_thread_data_on_block_idx[I2]),
ck::tensor_operation::element_wise::PassThrough{}};
auto c_thread_copy_vgpr_to_lds = ThreadwiseTensorSliceTransfer_v1r3<
AccDataType,
CShuffleDataType,
decltype(c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2),
decltype(c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2),
conditional_t<DoElementwiseBeforeCShuffle,
CElementwiseOperation,
tensor_operation::element_wise::PassThrough>,
Sequence<CShuffleMXdlPerWavePerShuffle,
CShuffleNXdlPerWavePerShuffle,
I1,
I1,
M2,
I1,
M4,
I1>,
Sequence<0, 1, 2, 3, 4, 5, 6, 7>,
7,
1,
InMemoryDataOperationEnum::Set,
1,
true>{c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2,
make_multi_index(0,
0,
m_thread_data_on_block_idx[I1],
n_thread_data_on_block_idx[I1],
m_thread_data_on_block_idx[I2],
m_thread_data_on_block_idx[I3],
m_thread_data_on_block_idx[I4],
n_thread_data_on_block_idx[I2]),
vpgr_to_lds_element_op()};
using EDataType = CDataType;
const auto ds_grid_desc_m_n = MakeDsGridDescriptor_M_N(
problem.M, problem.MPadded, problem.N, problem.NPadded, problem.StrideDs);
const auto ds_grid_desc_mblock_mperblock_nblock_nperblock =
MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
ds_grid_desc_m_n, problem.MBlock, problem.NBlock);
@@ -2068,7 +2205,9 @@ struct GridwiseGemmMultiD_xdl_cshuffle_v3
Tuple<EDataType>,
decltype(c_ds_desc_refs),
decltype(tie(e_grid_desc_mblock_mperblock_nblock_nperblock)),
CElementwiseOperation,
conditional_t<!DoElementwiseBeforeCShuffle,
CElementwiseOperation,
tensor_operation::element_wise::PassThrough>,
Sequence<static_cast<index_t>(EGlobalMemoryDataOperation)>, // FIXME: make Sequence
// support arbitray type
Sequence<1,
@@ -2092,7 +2231,7 @@ struct GridwiseGemmMultiD_xdl_cshuffle_v3
idx_c_ds_block_begin,
tie(e_grid_desc_mblock_mperblock_nblock_nperblock),
make_tuple(make_multi_index(block_m_id, 0, block_n_id, 0)),
c_element_op};
lds_to_global_element_op()};
// space filling curve for threadwise C in VGPR
constexpr auto sfc_c_vgpr =